Log Google Drive documents in MongoDB, add the source of the document, and make chunks overlap text.
acdfaa9
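The commit message mentions chunk overlap, but the overlap itself is applied inside DocumentProcessor.process_document rather than in this file. As a rough illustration of the idea only, a sliding-window splitter along the following lines repeats the tail of each chunk at the start of the next one; the split_with_overlap name and the chunk_size / chunk_overlap parameters are assumptions for this sketch, not the processor's actual API.

# Illustrative only: a minimal sliding-window splitter producing overlapping chunks.
# The real chunking lives in DocumentProcessor; chunk_size and chunk_overlap are
# assumed names, not the actual parameters used by this project.
def split_with_overlap(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    chunks = []
    step = chunk_size - chunk_overlap
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk:
            chunks.append(chunk)
        if start + chunk_size >= len(text):
            break
    return chunks

With chunk_size=1000 and chunk_overlap=200, each chunk shares its last 200 characters with the start of the next one, which helps retrieval when a relevant passage straddles a chunk boundary.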
# src/implementations/document_service.py
import os
import shutil
from pathlib import Path
from uuid import uuid4
from typing import List, Tuple, Dict

from fastapi import HTTPException, UploadFile, BackgroundTasks

from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.document_processor import DocumentProcessor
from src.models import DocumentResponse, DocumentInfo, BatchUploadResponse
from src.utils.logger import logger
from src.db.mongodb_store import MongoDBStore

class DocumentService:
    def __init__(
        self,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore
    ):
        self.doc_processor = doc_processor
        self.mongodb = mongodb
        self.permanent_dir = Path("uploads")
        self.permanent_dir.mkdir(exist_ok=True)

    async def check_duplicate_filename(self, filename: str) -> bool:
        """
        Check if a file with the same name exists

        Args:
            filename (str): Original filename to check

        Returns:
            bool: True if duplicate exists, False otherwise
        """
        documents = await self.mongodb.get_all_documents()
        return any(doc.get('filename') == filename for doc in documents)

    async def process_documents(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> BatchUploadResponse:
        """Process multiple document uploads"""
        processed_files, failed_files = await self._handle_file_uploads(
            files,
            vector_store,
            background_tasks
        )

        return BatchUploadResponse(
            message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
            processed_files=processed_files,
            failed_files=failed_files
        )

    async def _handle_file_uploads(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> Tuple[List[DocumentResponse], List[dict]]:
        """Handle individual file uploads and processing"""
        processed_files = []
        failed_files = []

        for file in files:
            try:
                # Check for duplicate filename
                if await self.check_duplicate_filename(file.filename):
                    failed_files.append(self._create_failed_file_entry(
                        file.filename,
                        "A document with this name already exists. Please upload another document."
                    ))
                    continue

                if not self._is_supported_format(file.filename):
                    failed_files.append(self._create_failed_file_entry(
                        file.filename,
                        "Unsupported file format"
                    ))
                    continue

                document_response = await self._process_single_file(
                    file,
                    vector_store,
                    background_tasks
                )
                processed_files.append(document_response)

            except Exception as e:
                logger.error(
                    f"Error processing file {file.filename}: {str(e)}")
                failed_files.append(self._create_failed_file_entry(
                    file.filename,
                    str(e)
                ))

        return processed_files, failed_files

    async def _process_single_file(
        self,
        file: UploadFile,
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> DocumentResponse:
        """Process a single file upload with proper handle closure"""
        document_id = str(uuid4())
        filename = f"{document_id}_{file.filename}"
        file_path = self.permanent_dir / filename
        url_path = f"/docs/{filename}"

        try:
            # Save file to permanent location using a context manager
            with open(file_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

            # Close the uploaded file explicitly
            await file.close()

            # Process document with proper cleanup for Excel files
            try:
                processed_doc = await self.doc_processor.process_document(file_path)

                # For Excel files, ensure pandas closes the file
                if file_path.suffix.lower() in ['.xlsx', '.xls']:
                    import gc
                    gc.collect()  # Help cleanup any lingering file handles
            except Exception as proc_error:
                logger.error(f"Error processing document: {str(proc_error)}")
                raise

            # Store in MongoDB with url_path
            await self.mongodb.store_document(
                document_id=document_id,
                filename=file.filename,
                content_type=file.content_type,
                file_size=os.path.getsize(file_path),
                url_path=url_path,
                source="user_upload"
            )

            # Process for vector store in background
            background_tasks.add_task(
                self._process_for_vector_store,
                processed_doc['chunks'],
                vector_store,
                document_id,
                file.filename
            )

            return DocumentResponse(
                message="Document uploaded successfully",
                document_id=document_id,
                status="processing",
                document_info=DocumentInfo(
                    original_filename=file.filename,
                    size=os.path.getsize(file_path),
                    content_type=file.content_type,
                    url_path=url_path
                )
            )

        except Exception as e:
            # Clean up file if it was created
            if file_path.exists():
                try:
                    file_path.unlink()
                except Exception as cleanup_error:
                    logger.error(
                        f"Error cleaning up file {file_path}: {str(cleanup_error)}")

            # Clean up from MongoDB if document was created
            try:
                await self.mongodb.delete_document(document_id)
            except Exception as db_cleanup_error:
                logger.error(
                    f"Error cleaning up MongoDB document {document_id}: {str(db_cleanup_error)}")

            logger.error(f"Error processing file {file.filename}: {str(e)}")
            raise

    async def _process_for_vector_store(
        self,
        chunks: List[str],
        vector_store: ChromaVectorStore,
        document_id: str,
        filename: str
    ):
        """Process document content for vector store"""
        try:
            # Generate chunk IDs using document_id
            chunk_ids = [
                f"{document_id}-chunk-{i}" for i in range(len(chunks))]

            # Get embeddings
            embeddings = vector_store.embedding_function(chunks)

            # Prepare metadata for each chunk
            metadatas = [{
                'document_id': document_id,
                'source_file': filename,
                'chunk_index': i,
                'total_chunks': len(chunks)
            } for i in range(len(chunks))]

            # Store in vector store
            vector_store.add_documents(
                documents=chunks,
                embeddings=embeddings,
                metadatas=metadatas,
                ids=chunk_ids
            )

            logger.info(
                f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
        except Exception as e:
            logger.error(
                f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
            raise

    def _is_supported_format(self, filename: str) -> bool:
        """Check if file format is supported"""
        return any(filename.lower().endswith(ext)
                   for ext in self.doc_processor.supported_formats)

    def _create_failed_file_entry(self, filename: str, error: str) -> dict:
        """Create a failed file entry"""
        return {
            "filename": filename,
            "error": error
        }

    async def delete_document(self, document_id: str) -> bool:
        """Delete document from storage and MongoDB"""
        try:
            # Get document details from MongoDB
            doc = await self.mongodb.get_document(document_id)
            if doc:
                # Get filename from url_path
                filename = doc['url_path'].split('/')[-1]
                file_path = self.permanent_dir / filename

                # Delete physical file if it exists
                if file_path.exists():
                    file_path.unlink()

                # Delete from MongoDB
                return await self.mongodb.delete_document(document_id)
            return False
        except Exception as e:
            logger.error(f"Error deleting document: {str(e)}")
            raise

    def cleanup(self):
        """Clean up permanent directory if empty"""
        if self.permanent_dir.exists() and not any(self.permanent_dir.iterdir()):
            self.permanent_dir.rmdir()