# Spaces:
# Running
# Running
# src/implementations/document_service.py
import os
import shutil
from pathlib import Path
from typing import Dict, List, Tuple
from uuid import uuid4

from fastapi import BackgroundTasks, UploadFile

from src.db.mongodb_store import MongoDBStore
from src.models import BatchUploadResponse, DocumentInfo, DocumentResponse
from src.utils.document_processor import DocumentProcessor
from src.utils.logger import logger
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
class DocumentService:
    """Handle document uploads: persist originals in MongoDB, index chunks in Chroma.

    Uploads are staged in a local temp directory, parsed by the document
    processor, stored in MongoDB, and then chunked/embedded into the vector
    store via a FastAPI background task so the request returns quickly.
    """

    def __init__(
        self,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore
    ):
        """
        Args:
            doc_processor: Parses raw files into text content and chunks.
            mongodb: Persistent store for full document content and metadata.
        """
        self.doc_processor = doc_processor
        self.mongodb = mongodb
        # Staging area for uploads; each staged file is removed after processing.
        self.upload_dir = Path("temp_uploads")
        # parents=True so a missing parent directory never aborts startup.
        self.upload_dir.mkdir(parents=True, exist_ok=True)

    async def process_documents(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> BatchUploadResponse:
        """Process multiple document uploads.

        Args:
            files: Uploaded files to ingest.
            vector_store: Destination vector store for embedded chunks.
            background_tasks: FastAPI task queue for deferred embedding work.

        Returns:
            BatchUploadResponse summarizing per-file successes and failures.
        """
        processed_files, failed_files = await self._handle_file_uploads(
            files,
            vector_store,
            background_tasks
        )

        return BatchUploadResponse(
            message=f"Processed {len(processed_files)} documents with {len(failed_files)} failures",
            processed_files=processed_files,
            failed_files=failed_files
        )

    async def _handle_file_uploads(
        self,
        files: List[UploadFile],
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> Tuple[List[DocumentResponse], List[dict]]:
        """Handle individual file uploads and processing.

        A failure in one file is recorded and does not abort the batch.

        Returns:
            Tuple of (successful DocumentResponses, failed-file dicts with
            ``filename`` and ``error`` keys).
        """
        processed_files = []
        failed_files = []

        for file in files:
            try:
                # Reject unsupported extensions up front, before any disk I/O.
                if not self._is_supported_format(file.filename):
                    failed_files.append(self._create_failed_file_entry(
                        file.filename,
                        "Unsupported file format"
                    ))
                    continue

                document_response = await self._process_single_file(
                    file,
                    vector_store,
                    background_tasks
                )
                processed_files.append(document_response)

            except Exception as e:
                # Record the failure and keep going with the rest of the batch.
                logger.error(f"Error processing file {file.filename}: {str(e)}")
                failed_files.append(self._create_failed_file_entry(
                    file.filename,
                    str(e)
                ))

        return processed_files, failed_files

    async def _process_single_file(
        self,
        file: UploadFile,
        vector_store: ChromaVectorStore,
        background_tasks: BackgroundTasks
    ) -> DocumentResponse:
        """Process a single file upload.

        Stages the file on disk under a UUID-prefixed name, parses it,
        stores the full content in MongoDB, and schedules chunk embedding
        as a background task. The staged file is always removed.

        Raises:
            Exception: Propagates any parsing/storage error to the caller,
                which records it as a per-file failure.
        """
        # Generate UUID for document; also used to namespace chunk IDs later.
        document_id = str(uuid4())
        temp_path = self.upload_dir / f"{document_id}_{file.filename}"

        try:
            # Save file temporarily
            with open(temp_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)

            # Process the document to get content and chunks while the
            # staged file still exists on disk.
            processed_doc = await self.doc_processor.process_document(temp_path)
            content = processed_doc['content']

            # Stat once; the size is needed for both MongoDB and the response.
            file_size = os.path.getsize(temp_path)

            # First, store in MongoDB
            await self.mongodb.store_document(
                document_id=document_id,
                filename=file.filename,
                content=content,
                content_type=file.content_type,
                file_size=file_size
            )

            # Then embed for the vector store in the background so the
            # request does not wait on the embedding function.
            background_tasks.add_task(
                self._process_for_vector_store,
                processed_doc['chunks'],  # Use the chunks from processed document
                vector_store,
                document_id,
                file.filename
            )

            return DocumentResponse(
                message="Document uploaded successfully",
                document_id=document_id,
                status="processing",
                document_info=DocumentInfo(
                    original_filename=file.filename,
                    size=file_size,
                    content_type=file.content_type
                )
            )
        finally:
            # Clean up the staged file whether or not processing succeeded.
            if temp_path.exists():
                temp_path.unlink()

    async def _process_for_vector_store(
        self,
        chunks: List[str],  # Pre-processed chunks from the document processor
        vector_store: ChromaVectorStore,
        document_id: str,
        filename: str
    ):
        """Embed pre-chunked document content and store it in the vector store.

        Runs as a background task. Each chunk gets a deterministic ID of the
        form ``{document_id}-chunk-{i}`` plus metadata linking it back to the
        MongoDB document.
        """
        try:
            # Generate chunk IDs using document_id so chunks can be traced
            # (and deleted) per source document.
            chunk_ids = [f"{document_id}-chunk-{i}" for i in range(len(chunks))]

            # Get embeddings for all chunks in one call
            embeddings = vector_store.embedding_function(chunks)

            # Prepare metadata for each chunk
            metadatas = [{
                'document_id': document_id,  # MongoDB document ID
                'source_file': filename,
                'chunk_index': i,
                'total_chunks': len(chunks)
            } for i in range(len(chunks))]

            # Store in vector store
            vector_store.add_documents(
                documents=chunks,
                embeddings=embeddings,
                metadatas=metadatas,
                ids=chunk_ids
            )

            # Fixed: log messages previously contained the literal "(unknown)"
            # where the filename belonged.
            logger.info(f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
        except Exception as e:
            logger.error(f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
            raise

    def _is_supported_format(self, filename: str) -> bool:
        """Return True if the filename's extension is supported (case-insensitive)."""
        # UploadFile.filename may be None or empty; treat as unsupported
        # instead of raising AttributeError on .lower().
        if not filename:
            return False
        return any(filename.lower().endswith(ext)
                   for ext in self.doc_processor.supported_formats)

    def _create_failed_file_entry(self, filename: str, error: str) -> dict:
        """Create a failed file entry with ``filename`` and ``error`` keys."""
        return {
            "filename": filename,
            "error": error
        }

    def cleanup(self):
        """Remove the upload directory, but only if it exists and is empty."""
        if self.upload_dir.exists() and not any(self.upload_dir.iterdir()):
            self.upload_dir.rmdir()