TalatMasood committed
Commit d161383 · 1 Parent(s): e9d730a

Update knowledge upload API and link ChromaDB to MongoDB

.vscode/launch.json CHANGED
@@ -2,7 +2,7 @@
     "version": "0.2.0",
     "configurations": [
         {
-            "name": "Python: FastAPI",
+            "name": "Chatbot",
             "type": "python",
             "request": "launch",
             "module": "uvicorn",
@@ -17,7 +17,7 @@
             }
         },
         {
-            "name": "Python: Test",
+            "name": "Chatbot: Tests",
            "type": "python",
            "request": "launch",
            "module": "pytest",
DocKnowledge-based chatbot.docx ADDED
Binary file (16.9 kB)
 
src/__pycache__/main.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
 
src/db/__pycache__/mongodb_store.cpython-312.pyc CHANGED
Binary files a/src/db/__pycache__/mongodb_store.cpython-312.pyc and b/src/db/__pycache__/mongodb_store.cpython-312.pyc differ
 
src/db/mongodb_store.py CHANGED
@@ -2,7 +2,7 @@
 from motor.motor_asyncio import AsyncIOMotorClient
 from datetime import datetime
 import json
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Any
 from bson import ObjectId
 
 class MongoDBStore:
@@ -11,6 +11,40 @@ class MongoDBStore:
         self.client = AsyncIOMotorClient(mongo_uri)
         self.db = self.client.rag_chatbot
         self.chat_history = self.db.chat_history
+        self.documents = self.db.documents  # Collection for original documents
+
+    async def store_document(
+        self,
+        document_id: str,
+        filename: str,
+        content: str,
+        content_type: str,
+        file_size: int
+    ) -> str:
+        """Store original document in MongoDB"""
+        document = {
+            "document_id": document_id,
+            "filename": filename,
+            "content": content,
+            "content_type": content_type,
+            "file_size": file_size,
+            "upload_timestamp": datetime.now()
+        }
+
+        await self.documents.insert_one(document)
+        return document_id
+
+    async def get_document(self, document_id: str) -> Optional[Dict]:
+        """Retrieve document by ID"""
+        return await self.documents.find_one(
+            {"document_id": document_id},
+            {"_id": 0}  # Exclude MongoDB's _id
+        )
+
+    async def get_all_documents(self) -> List[Dict]:
+        """Retrieve all documents"""
+        cursor = self.documents.find({}, {"_id": 0})
+        return await cursor.to_list(length=None)
 
     async def store_message(
         self,
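
For context, a minimal usage sketch of the new document collection (this assumes a MongoDB instance reachable at the URI below; the filename, content, and size are placeholders, not values from the commit):

import asyncio
from uuid import uuid4

from src.db.mongodb_store import MongoDBStore

async def demo():
    store = MongoDBStore("mongodb://localhost:27017")  # assumed local URI
    doc_id = str(uuid4())
    await store.store_document(
        document_id=doc_id,
        filename="example.txt",                 # placeholder filename
        content="full extracted document text", # placeholder content
        content_type="text/plain",
        file_size=28,
    )
    print(await store.get_document(doc_id))       # single record, _id excluded
    print(len(await store.get_all_documents()))   # count of stored documents

asyncio.run(demo())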
src/implementations/__pycache__/document_service.cpython-312.pyc CHANGED
Binary files a/src/implementations/__pycache__/document_service.cpython-312.pyc and b/src/implementations/__pycache__/document_service.cpython-312.pyc differ
 
src/implementations/document_service.py CHANGED
@@ -2,17 +2,24 @@
 from pathlib import Path
 import shutil
 import os
-import uuid
-from typing import List, Tuple
+from uuid import uuid4
+from typing import List, Tuple, Dict
 from fastapi import UploadFile, BackgroundTasks
-from ..vectorstores.chroma_vectorstore import ChromaVectorStore
-from ..utils.document_processor import DocumentProcessor
-from ..models import DocumentResponse, DocumentInfo, BatchUploadResponse
-from ..utils.logger import logger
+
+from src.vectorstores.chroma_vectorstore import ChromaVectorStore
+from src.utils.document_processor import DocumentProcessor
+from src.models import DocumentResponse, DocumentInfo, BatchUploadResponse
+from src.utils.logger import logger
+from src.db.mongodb_store import MongoDBStore
 
 class DocumentService:
-    def __init__(self, doc_processor: DocumentProcessor):
+    def __init__(
+        self,
+        doc_processor: DocumentProcessor,
+        mongodb: MongoDBStore
+    ):
         self.doc_processor = doc_processor
+        self.mongodb = mongodb
         self.upload_dir = Path("temp_uploads")
         self.upload_dir.mkdir(exist_ok=True)
 
@@ -70,11 +77,6 @@ class DocumentService:
 
         return processed_files, failed_files
 
-    def _is_supported_format(self, filename: str) -> bool:
-        """Check if file format is supported"""
-        return any(filename.lower().endswith(ext)
-                   for ext in self.doc_processor.supported_formats)
-
     async def _process_single_file(
         self,
         file: UploadFile,
@@ -82,57 +84,93 @@ class DocumentService:
        background_tasks: BackgroundTasks
    ) -> DocumentResponse:
        """Process a single file upload"""
-        document_id = str(uuid.uuid4())
+        # Generate UUID for document
+        document_id = str(uuid4())
        temp_path = self.upload_dir / f"{document_id}_{file.filename}"
 
-        # Save file
-        with open(temp_path, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-
-        # Add background task for processing
-        background_tasks.add_task(
-            self._process_and_store_document,
-            temp_path,
-            vector_store,
-            document_id
-        )
+        try:
+            # Save file temporarily
+            with open(temp_path, "wb") as buffer:
+                shutil.copyfileobj(file.file, buffer)
+
+            # Process the document to get content and metadata
+            processed_doc = await self.doc_processor.process_document(temp_path)
+            content = processed_doc['content']
+
+            # First, store in MongoDB
+            await self.mongodb.store_document(
+                document_id=document_id,
+                filename=file.filename,
+                content=content,
+                content_type=file.content_type,
+                file_size=os.path.getsize(temp_path)
+            )
 
-        return DocumentResponse(
-            message="Document queued for processing",
-            document_id=document_id,
-            status="processing",
-            document_info=DocumentInfo(
-                original_filename=file.filename,
-                size=os.path.getsize(temp_path),
-                content_type=file.content_type
+            # Then process for vector store in background
+            background_tasks.add_task(
+                self._process_for_vector_store,
+                processed_doc['chunks'],  # Use the chunks from processed document
+                vector_store,
+                document_id,
+                file.filename
             )
-        )
 
-    async def _process_and_store_document(
+            return DocumentResponse(
+                message="Document uploaded successfully",
+                document_id=document_id,
+                status="processing",
+                document_info=DocumentInfo(
+                    original_filename=file.filename,
+                    size=os.path.getsize(temp_path),
+                    content_type=file.content_type
+                )
+            )
+        finally:
+            # Clean up temporary file
+            if temp_path.exists():
+                temp_path.unlink()
+
+    async def _process_for_vector_store(
         self,
-        file_path: Path,
+        chunks: List[str],  # Now accepting pre-processed chunks
         vector_store: ChromaVectorStore,
-        document_id: str
+        document_id: str,
+        filename: str
     ):
-        """Process document and store in vector database"""
+        """Process document content for vector store"""
        try:
-            processed_doc = await self.doc_processor.process_document(file_path)
+            # Generate chunk IDs using document_id
+            chunk_ids = [f"{document_id}-chunk-{i}" for i in range(len(chunks))]
 
+            # Get embeddings
+            embeddings = vector_store.embedding_function(chunks)
+
+            # Prepare metadata for each chunk
+            metadatas = [{
+                'document_id': document_id,  # MongoDB document ID
+                'source_file': filename,
+                'chunk_index': i,
+                'total_chunks': len(chunks)
+            } for i in range(len(chunks))]
+
+            # Store in vector store
            vector_store.add_documents(
-                documents=processed_doc['chunks'],
-                metadatas=[{
-                    'document_id': document_id,
-                    'chunk_id': i,
-                    'source': str(file_path.name),
-                    'metadata': processed_doc['metadata']
-                } for i in range(len(processed_doc['chunks']))],
-                ids=[f"{document_id}_chunk_{i}" for i in range(len(processed_doc['chunks']))]
+                documents=chunks,
+                embeddings=embeddings,
+                metadatas=metadatas,
+                ids=chunk_ids
            )
 
-            return processed_doc
-        finally:
-            if file_path.exists():
-                file_path.unlink()
+            logger.info(f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
+
+        except Exception as e:
+            logger.error(f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
+            raise
+
+    def _is_supported_format(self, filename: str) -> bool:
+        """Check if file format is supported"""
+        return any(filename.lower().endswith(ext)
+                   for ext in self.doc_processor.supported_formats)
 
    def _create_failed_file_entry(self, filename: str, error: str) -> dict:
        """Create a failed file entry"""
src/main.py CHANGED
@@ -7,6 +7,7 @@ from datetime import datetime
 
 # Import custom modules
 from src.agents.rag_agent import RAGAgent
+from src.models.document import AllDocumentsResponse, StoredDocument
 from src.utils.document_processor import DocumentProcessor
 from src.utils.conversation_summarizer import ConversationSummarizer
 from src.utils.logger import logger
@@ -16,6 +17,7 @@ from src.implementations.document_service import DocumentService
 from src.models import (
     ChatRequest,
     ChatResponse,
+    DocumentResponse,
     BatchUploadResponse,
     SummarizeRequest,
     SummaryResponse,
@@ -25,6 +27,9 @@ from config.config import settings
 
 app = FastAPI(title="RAG Chatbot API")
 
+# Initialize MongoDB
+mongodb = MongoDBStore(settings.MONGODB_URI)
+
 # Initialize core components
 doc_processor = DocumentProcessor(
     chunk_size=1000,
@@ -32,10 +37,7 @@ doc_processor = DocumentProcessor(
     max_file_size=10 * 1024 * 1024
 )
 summarizer = ConversationSummarizer()
-document_service = DocumentService(doc_processor)
-
-# Initialize MongoDB
-mongodb = MongoDBStore(settings.MONGODB_URI)
+document_service = DocumentService(doc_processor, mongodb)
 
 @app.post("/documents/upload", response_model=BatchUploadResponse)
 async def upload_documents(
@@ -57,6 +59,52 @@ async def upload_documents(
     finally:
         document_service.cleanup()
 
+@app.get("/documents", response_model=AllDocumentsResponse)
+async def get_all_documents(include_embeddings: bool = False):
+    """
+    Get all documents stored in the system
+
+    Args:
+        include_embeddings (bool): Whether to include embeddings in the response
+    """
+    try:
+        vector_store, _ = await get_vector_store()
+        documents = vector_store.get_all_documents(include_embeddings=include_embeddings)
+
+        return AllDocumentsResponse(
+            total_documents=len(documents),
+            documents=[
+                StoredDocument(
+                    id=doc['id'],
+                    text=doc['text'],
+                    embedding=doc.get('embedding'),
+                    metadata=doc.get('metadata')
+                ) for doc in documents
+            ]
+        )
+    except Exception as e:
+        logger.error(f"Error retrieving documents: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/documentchunks/{document_id}")
+async def get_document_chunks(document_id: str):
+    """Get all chunks for a specific document"""
+    try:
+        vector_store, _ = await get_vector_store()
+        chunks = vector_store.get_document_chunks(document_id)
+
+        if not chunks:
+            raise HTTPException(status_code=404, detail="Document not found")
+
+        return {
+            "document_id": document_id,
+            "total_chunks": len(chunks),
+            "chunks": chunks
+        }
+    except Exception as e:
+        logger.error(f"Error retrieving document chunks: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(
     request: ChatRequest,
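
A minimal client-side sketch for the two new read endpoints (assuming the API is served locally on port 8000 and the requests package is installed; adjust the base URL for other deployments):

import requests

BASE_URL = "http://localhost:8000"  # assumed local server

# List every stored document (embeddings omitted by default)
docs = requests.get(f"{BASE_URL}/documents", params={"include_embeddings": False}).json()
print(docs["total_documents"])

# Fetch the chunks of the first document, if any exist
if docs["documents"]:
    doc_id = docs["documents"][0]["id"]
    chunks = requests.get(f"{BASE_URL}/documentchunks/{doc_id}").json()
    print(chunks["total_chunks"])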
src/models/__pycache__/document.cpython-312.pyc CHANGED
Binary files a/src/models/__pycache__/document.cpython-312.pyc and b/src/models/__pycache__/document.cpython-312.pyc differ
 
src/models/document.py CHANGED
@@ -1,6 +1,6 @@
 # src/models/document.py
 from pydantic import BaseModel
-from typing import Optional, List
+from typing import Optional, List, Dict, Any
 
 class DocumentInfo(BaseModel):
     """Document information model"""
@@ -19,4 +19,16 @@ class BatchUploadResponse(BaseModel):
     """Response model for batch document upload"""
     message: str
     processed_files: List[DocumentResponse]
-    failed_files: List[dict]
+    failed_files: List[dict]
+
+class StoredDocument(BaseModel):
+    """Model for a document stored in the vector store"""
+    id: str
+    text: str
+    embedding: Optional[List[float]] = None
+    metadata: Optional[Dict[str, Any]] = None
+
+class AllDocumentsResponse(BaseModel):
+    """Response model for retrieving all documents"""
+    total_documents: int
+    documents: List[StoredDocument]
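
A short sketch of how the new response models compose (field values are illustrative, not from the commit):

from src.models.document import AllDocumentsResponse, StoredDocument

response = AllDocumentsResponse(
    total_documents=1,
    documents=[
        StoredDocument(
            id="example-document-id",
            text="first chunk text",
            embedding=None,  # populated only when include_embeddings=True
            metadata={"document_id": "example-document-id", "chunk_index": 0},
        )
    ],
)
print(response.dict())  # .model_dump() on Pydantic v2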
src/vectorstores/__pycache__/base_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/base_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/base_vectorstore.cpython-312.pyc differ
 
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
 
src/vectorstores/base_vectorstore.py CHANGED
@@ -1,20 +1,21 @@
 # src/vectorstores/base_vectorstore.py
 from abc import ABC, abstractmethod
-from typing import List, Callable, Any
+from typing import List, Callable, Any, Dict, Optional
 
 class BaseVectorStore(ABC):
     @abstractmethod
     def add_documents(
         self,
         documents: List[str],
-        embeddings: List[List[float]]
+        embeddings: Optional[List[List[float]]] = None
     ) -> None:
         """
         Add documents to the vector store
 
         Args:
             documents (List[str]): List of document texts
-            embeddings (List[List[float]]): Corresponding embeddings
+            embeddings (Optional[List[List[float]]]): Corresponding embeddings.
+                If not provided, they will be generated using the embedding function.
         """
         pass
 
@@ -22,7 +23,8 @@ class BaseVectorStore(ABC):
     def similarity_search(
         self,
         query_embedding: List[float],
-        top_k: int = 3
+        top_k: int = 3,
+        **kwargs
     ) -> List[str]:
         """
         Perform similarity search
@@ -30,8 +32,25 @@ class BaseVectorStore(ABC):
         Args:
             query_embedding (List[float]): Embedding of the query
             top_k (int): Number of top similar documents to retrieve
+            **kwargs: Additional search parameters
 
         Returns:
             List[str]: List of most similar documents
         """
+        pass
+
+    @abstractmethod
+    def get_all_documents(
+        self,
+        include_embeddings: bool = False
+    ) -> List[Dict[str, Any]]:
+        """
+        Retrieve all documents from the vector store
+
+        Args:
+            include_embeddings (bool): Whether to include embeddings in the response
+
+        Returns:
+            List[Dict[str, Any]]: List of documents with their IDs and optionally embeddings
+        """
         pass
src/vectorstores/chroma_vectorstore.py CHANGED
@@ -1,6 +1,8 @@
 # src/vectorstores/chroma_vectorstore.py
 import chromadb
-from typing import List, Callable, Any
+from typing import List, Callable, Any, Dict, Optional
+from chromadb.config import Settings
+import logging
 
 from .base_vectorstore import BaseVectorStore
 
@@ -8,7 +10,9 @@ class ChromaVectorStore(BaseVectorStore):
     def __init__(
         self,
         embedding_function: Callable[[List[str]], List[List[float]]],
-        persist_directory: str = './chroma_db'
+        persist_directory: str = './chroma_db',
+        collection_name: str = "documents",
+        client_settings: Optional[Dict[str, Any]] = None
     ):
         """
         Initialize Chroma Vector Store
@@ -16,39 +20,77 @@ class ChromaVectorStore(BaseVectorStore):
         Args:
             embedding_function (Callable): Function to generate embeddings
             persist_directory (str): Directory to persist the vector store
+            collection_name (str): Name of the collection to use
+            client_settings (Optional[Dict[str, Any]]): Additional settings for ChromaDB client
         """
-        self.client = chromadb.PersistentClient(path=persist_directory)
-        self.collection = self.client.get_or_create_collection(name="documents")
-        self.embedding_function = embedding_function
+        try:
+            settings = Settings(
+                persist_directory=persist_directory,
+                **(client_settings or {})
+            )
+            self.client = chromadb.PersistentClient(settings=settings)
+            self.collection = self.client.get_or_create_collection(
+                name=collection_name,
+                metadata={"hnsw:space": "cosine"}  # Using cosine similarity by default
+            )
+            self.embedding_function = embedding_function
+        except Exception as e:
+            logging.error(f"Error initializing ChromaDB: {str(e)}")
+            raise
 
     def add_documents(
         self,
         documents: List[str],
-        embeddings: List[List[float]] = None
+        embeddings: Optional[List[List[float]]] = None,
+        metadatas: Optional[List[Dict[str, Any]]] = None,
+        ids: Optional[List[str]] = None
     ) -> None:
         """
         Add documents to the vector store
 
         Args:
             documents (List[str]): List of document texts
-            embeddings (List[List[float]], optional): Pre-computed embeddings
+            embeddings (Optional[List[List[float]]]): Pre-computed embeddings
+            metadatas (Optional[List[Dict[str, Any]]]): Metadata for each document
+            ids (Optional[List[str]]): Custom IDs for the documents
         """
-        if not embeddings:
-            embeddings = self.embedding_function(documents)
-
-        # Generate unique IDs
-        ids = [f"doc_{i}" for i in range(len(documents))]
-
-        self.collection.add(
-            documents=documents,
-            embeddings=embeddings,
-            ids=ids
-        )
+        try:
+            if not documents:
+                logging.warning("No documents provided to add_documents")
+                return
+
+            if not embeddings:
+                embeddings = self.embedding_function(documents)
+
+            if len(documents) != len(embeddings):
+                raise ValueError("Number of documents and embeddings must match")
+
+            # Use provided IDs or generate them
+            doc_ids = ids if ids is not None else [f"doc_{i}" for i in range(len(documents))]
+
+            # Prepare add parameters
+            add_params = {
+                "documents": documents,
+                "embeddings": embeddings,
+                "ids": doc_ids
+            }
+
+            # Only include metadatas if provided
+            if metadatas is not None:
+                if len(metadatas) != len(documents):
+                    raise ValueError("Number of documents and metadatas must match")
+                add_params["metadatas"] = metadatas
+
+            self.collection.add(**add_params)
+        except Exception as e:
+            logging.error(f"Error adding documents to ChromaDB: {str(e)}")
+            raise
 
     def similarity_search(
         self,
         query_embedding: List[float],
-        top_k: int = 3
+        top_k: int = 3,
+        **kwargs
     ) -> List[str]:
         """
         Perform similarity search
@@ -56,13 +98,114 @@ class ChromaVectorStore(BaseVectorStore):
         Args:
             query_embedding (List[float]): Embedding of the query
             top_k (int): Number of top similar documents to retrieve
+            **kwargs: Additional search parameters
 
         Returns:
             List[str]: List of most similar documents
         """
-        results = self.collection.query(
-            query_embeddings=[query_embedding],
-            n_results=top_k
-        )
-
-        return results.get('documents', [[]])[0]
+        try:
+            results = self.collection.query(
+                query_embeddings=[query_embedding],
+                n_results=top_k,
+                **kwargs
+            )
+
+            # Handle the case where no results are found
+            if not results or 'documents' not in results:
+                return []
+
+            return results.get('documents', [[]])[0]
+        except Exception as e:
+            logging.error(f"Error performing similarity search in ChromaDB: {str(e)}")
+            raise
+
+    def get_all_documents(
+        self,
+        include_embeddings: bool = False
+    ) -> List[Dict[str, Any]]:
+        """
+        Retrieve all documents from the vector store
+        """
+        try:
+            include = ["documents", "metadatas"]
+            if include_embeddings:
+                include.append("embeddings")
+
+            results = self.collection.get(
+                include=include
+            )
+
+            if not results or 'documents' not in results:
+                return []
+
+            documents = []
+            for i in range(len(results['documents'])):
+                doc = {
+                    'id': str(i),  # Generate sequential IDs
+                    'text': results['documents'][i],
+                }
+
+                if include_embeddings and 'embeddings' in results:
+                    doc['embedding'] = results['embeddings'][i]
+
+                if 'metadatas' in results and results['metadatas'][i]:
+                    doc['metadata'] = results['metadatas'][i]
+
+                    # Use document_id from metadata if available
+                    if 'document_id' in results['metadatas'][i]:
+                        doc['id'] = results['metadatas'][i]['document_id']
+
+                documents.append(doc)
+
+            return documents
+        except Exception as e:
+            logging.error(f"Error retrieving documents from ChromaDB: {str(e)}")
+            raise
+
+    def get_document_chunks(self, document_id: str) -> List[Dict[str, Any]]:
+        """Retrieve all chunks for a specific document"""
+        try:
+            results = self.collection.get(
+                where={"document_id": document_id},
+                include=["documents", "metadatas"]
+            )
+
+            if not results or 'documents' not in results:
+                return []
+
+            chunks = []
+            for i in range(len(results['documents'])):
+                chunk = {
+                    'text': results['documents'][i],
+                    'metadata': results['metadatas'][i] if results.get('metadatas') else None
+                }
+                chunks.append(chunk)
+
+            # Sort by chunk_index if available
+            chunks.sort(key=lambda x: x.get('metadata', {}).get('chunk_index', 0))
+
+            return chunks
+        except Exception as e:
+            logging.error(f"Error retrieving document chunks: {str(e)}")
+            raise
+
+    def delete_document(self, document_id: str) -> None:
+        """Delete all chunks associated with a document_id"""
+        try:
+            # Get all chunks with the given document_id
+            results = self.collection.get(
+                where={"document_id": document_id},
+                include=["metadatas"]
+            )
+
+            if not results or 'ids' not in results:
+                logging.warning(f"No document found with ID: {document_id}")
+                return
+
+            # Delete all chunks associated with the document
+            chunk_ids = [f"{document_id}-chunk-{i}" for i in range(len(results['metadatas']))]
+            self.collection.delete(ids=chunk_ids)
+
+        except Exception as e:
+            logging.error(f"Error deleting document {document_id} from ChromaDB: {str(e)}")
+            raise
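
To exercise the extended store end to end, a minimal sketch under stated assumptions (the toy embedding function, the persist directory, and the document ID are placeholders; a real deployment would pass the project's embedding function and rely on the constructor defaults from the diff above):

from src.vectorstores.chroma_vectorstore import ChromaVectorStore

def toy_embeddings(texts):
    # Fixed-size dummy vectors, just enough to exercise the API
    return [[float(len(t)), 1.0, 0.0] for t in texts]

store = ChromaVectorStore(
    embedding_function=toy_embeddings,
    persist_directory="./chroma_db_demo",  # throwaway directory
)

doc_id = "example-document-id"
chunks = ["alpha chunk", "beta chunk"]
store.add_documents(
    documents=chunks,
    metadatas=[{"document_id": doc_id, "chunk_index": i, "total_chunks": len(chunks)}
               for i in range(len(chunks))],
    ids=[f"{doc_id}-chunk-{i}" for i in range(len(chunks))],
)

print(store.get_document_chunks(doc_id))   # chunks sorted by chunk_index
print(len(store.get_all_documents()))      # one entry per stored chunk
store.delete_document(doc_id)              # removes both chunks again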