Commit 415595f
Parent: f36ab64
Update chatbot with deployment configurations for Render
- .gitignore +49 -0
- DockerComposeConfiguration +0 -33
- Dockerfile +0 -25
- config/__pycache__/config.cpython-312.pyc +0 -0
- config/config.py +49 -31
- render.yaml +25 -0
- requirements.txt +38 -23
- runtime.txt +1 -0
- setup.py +0 -53
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent_manager.cpython-312.pyc +0 -0
- src/agents/__pycache__/system_instructions_rag.cpython-312.pyc +0 -0
- src/agents/enhanced_context_manager.py +0 -202
- src/agents/rag_agent.py +144 -247
- src/agents/rag_agent_manager.py +0 -77
- src/agents/system_instructions_rag.py +177 -338
- src/implementations/__pycache__/document_service.cpython-312.pyc +0 -0
- src/main.py +258 -138
- src/utils/__pycache__/database_cleanup.cpython-312.pyc +0 -0
- src/utils/__pycache__/document_processor.cpython-312.pyc +0 -0
- src/utils/database_cleanup.py +144 -91
- src/utils/document_processor.py +365 -64
- src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc +0 -0
- src/vectorstores/chroma_vectorstore.py +75 -61
- temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx +0 -0
.gitignore
ADDED
@@ -0,0 +1,49 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+ENV/
+
+# Environment Variables
+.env
+.env.local
+.env.*.local
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Logs
+*.log
+
+# Database
+chroma_db/
+uploads/
+temp_downloads/
+
+# OS
+.DS_Store
+Thumbs.db
DockerComposeConfiguration
DELETED
@@ -1,33 +0,0 @@
-version: '3.8'
-
-services:
-  app:
-    build: .
-    ports:
-      - "8000:8000"
-    env_file:
-      - .env
-    volumes:
-      - ./:/app
-    depends_on:
-      - ollama
-
-  ollama:
-    image: ollama/ollama
-    ports:
-      - "11434:11434"
-    volumes:
-      - ollama-data:/root/.ollama
-
-  chroma:
-    image: chromadb/chroma
-    ports:
-      - "8000:8000"
-    volumes:
-      - chroma-data:/chroma
-    environment:
-      - PERSIST_DIRECTORY=/chroma
-
-volumes:
-  ollama-data:
-  chroma-data:
Dockerfile
DELETED
@@ -1,25 +0,0 @@
-# Use an official Python runtime as a parent image
-FROM python:3.9-slim
-
-# Set the working directory in the container
-WORKDIR /app
-
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-# Copy the current directory contents into the container at /app
-COPY . /app
-
-# Install any needed packages specified in requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Make port 8000 available to the world outside this container
-EXPOSE 8000
-
-# Define environment variable
-ENV NAME RAGChatbot
-
-# Run the application
-CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
config/__pycache__/config.cpython-312.pyc
CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
config/config.py
CHANGED
@@ -7,32 +7,33 @@ from google_auth_oauthlib.flow import Flow
 # Load environment variables
 load_dotenv()

+
 class Settings:
     # OpenAI Configuration
     OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
     OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')
-
+
     ADMIN_API_KEY = 'aca4081f-6ff2-434c-843b-98f60285c499'

     # Ollama Configuration
     OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
     OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')
-
+
     # Anthropic Configuration
     ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
-
+
     # Embedding Configuration
     EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')
-
+
     # Vector Store Configuration
     CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')
-
+
     # MongoDB Configuration
     MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017')
-
+
     # Feedback Configuration
     MAX_RATING = int(os.getenv('MAX_RATING', '5'))
-
+
     # Temporary directory for downloaded files
     TEMP_DOWNLOAD_DIR = os.getenv('TEMP_DOWNLOAD_DIR', './temp_downloads')

@@ -40,27 +41,44 @@ class Settings:
     DEBUG = os.getenv('DEBUG', 'False') == 'True'

     # Google Drive Configuration
-    GOOGLE_DRIVE_FOLDER_ID=os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
-    GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv(
+    GOOGLE_DRIVE_FOLDER_ID = os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
+    GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv(
+        'GOOGLE_SERVICE_ACCOUNT_PATH', 'service_account.json')
+
+    # Use explicit type conversion to ensure correct types
+    DOCUMENT_PROCESSOR = {
+        'chunk_size': int(os.getenv('DOCUMENT_CHUNK_SIZE', '1000')),
+        'chunk_overlap': int(os.getenv('DOCUMENT_CHUNK_OVERLAP', '200')),
+        # 20MB in bytes
+        'max_file_size': int(os.getenv('DOCUMENT_MAX_FILE_SIZE', str(20 * 1024 * 1024))),
+        'supported_formats': [
+            '.txt', '.pdf', '.docx', '.csv', '.json',
+            '.html', '.md', '.xml', '.rtf', '.xlsx', '.xls'
+        ]
+    }
+
+    @classmethod
+    def get_document_processor_settings(cls) -> dict:
+        """
+        Get document processor settings with validation
+
+        Returns:
+            dict: Validated document processor settings
+        """
+        settings = cls.DOCUMENT_PROCESSOR.copy()
+
+        # Ensure positive values for numeric settings
+        settings['chunk_size'] = max(
+            100, settings['chunk_size'])  # Minimum 100
+        settings['chunk_overlap'] = min(
+            settings['chunk_overlap'],
+            # Ensure overlap is less than chunk size
+            settings['chunk_size'] - 50
+        )
+        settings['max_file_size'] = max(
+            1024 * 1024, settings['max_file_size'])  # Minimum 1MB
+
+        return settings
+
+
+settings = Settings()
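Note: the new get_document_processor_settings() clamps its values rather than raising; a minimal usage sketch of the resulting behavior with the defaults above (assuming the package layout makes this importable as config.config):

# Hypothetical usage sketch for Settings.get_document_processor_settings().
from config.config import settings

processor = settings.get_document_processor_settings()
# chunk_size is floored at 100, chunk_overlap is capped at chunk_size - 50,
# and max_file_size is floored at 1 MB, so the defaults pass through unchanged:
assert processor['chunk_size'] == 1000
assert processor['chunk_overlap'] == 200
assert processor['max_file_size'] == 20 * 1024 * 1024
print(processor['supported_formats'])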
render.yaml
ADDED
@@ -0,0 +1,25 @@
+services:
+  - type: web
+    name: chatbot-backend
+    env: python
+    region: ohio  # Choose appropriate region
+    plan: starter  # Or choose appropriate plan
+    buildCommand: pip install -r requirements.txt
+    startCommand: uvicorn src.main:app --host 0.0.0.0 --port $PORT
+    envVars:
+      - key: MONGODB_URI
+        sync: false
+      - key: OPENAI_API_KEY
+        sync: false
+      - key: ANTHROPIC_API_KEY
+        sync: false
+      - key: ADMIN_API_KEY
+        sync: false
+      - key: CHROMA_PATH
+        value: ./chroma_db
+      - key: DEBUG
+        value: "False"
+      - key: ENVIRONMENT
+        value: "production"
+    healthCheckPath: /health
+    autoDeploy: true
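Note: healthCheckPath: /health assumes src.main:app exposes that route; a minimal sketch of such an endpoint (hypothetical, since the full src/main.py diff is not shown here):

# Hypothetical /health route matching render.yaml's healthCheckPath.
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
async def health_check():
    # Render treats any 2xx response as healthy.
    return {"status": "ok"}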
requirements.txt
CHANGED
@@ -1,23 +1,38 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+fastapi
+uvicorn
+torch
+transformers
+openai
+anthropic
+sentence-transformers
+accelerate
+bitsandbytes
+pydantic
+email-validator
+numpy
+pandas
+scipy
+scikit-learn
+pymongo
+motor
+chromadb
+aiosqlite
+python-dotenv
+box
+PyPDF2
+python-docx
+python-magic-bin==0.4.14
+openpyxl
+xlrd
+striprtf
+beautifulsoup4
+pydrive2==1.14.0
+google-auth-oauthlib==0.4.6
+requests
+tqdm
+matplotlib
+plotly
+tiktoken
+psutil
+huggingface_hub
+setuptools
runtime.txt
ADDED
@@ -0,0 +1 @@
+python-3.12
setup.py
DELETED
@@ -1,53 +0,0 @@
|
|
1 |
-
from setuptools import setup, find_packages
|
2 |
-
|
3 |
-
setup(
|
4 |
-
name="chatbot",
|
5 |
-
version="1.0.0",
|
6 |
-
packages=find_packages(),
|
7 |
-
install_requires=[
|
8 |
-
# Web Framework
|
9 |
-
"fastapi",
|
10 |
-
"uvicorn",
|
11 |
-
|
12 |
-
# AI/ML
|
13 |
-
"torch",
|
14 |
-
"transformers",
|
15 |
-
"sentence-transformers",
|
16 |
-
"huggingface_hub",
|
17 |
-
|
18 |
-
# LLM Providers
|
19 |
-
"openai",
|
20 |
-
"anthropic",
|
21 |
-
"ollama",
|
22 |
-
|
23 |
-
# Data Validation & Processing
|
24 |
-
"pydantic",
|
25 |
-
"email-validator",
|
26 |
-
"numpy",
|
27 |
-
"pandas",
|
28 |
-
|
29 |
-
# Database & Storage
|
30 |
-
"pymongo",
|
31 |
-
"motor",
|
32 |
-
"chromadb",
|
33 |
-
"aiosqlite",
|
34 |
-
|
35 |
-
# Document Processing
|
36 |
-
"PyPDF2",
|
37 |
-
"python-docx",
|
38 |
-
"python-magic-bin==0.4.14",
|
39 |
-
"openpyxl",
|
40 |
-
"xlrd",
|
41 |
-
"striprtf",
|
42 |
-
"beautifulsoup4",
|
43 |
-
|
44 |
-
# Utilities
|
45 |
-
"python-dotenv",
|
46 |
-
"requests",
|
47 |
-
"tiktoken",
|
48 |
-
"psutil",
|
49 |
-
|
50 |
-
# Google Integration
|
51 |
-
"google-auth-oauthlib==0.4.6"
|
52 |
-
]
|
53 |
-
)
|
src/__pycache__/main.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ

src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc and b/src/agents/__pycache__/enhanced_context_manager.cpython-312.pyc differ

src/agents/__pycache__/rag_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ

src/agents/__pycache__/rag_agent_manager.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent_manager.cpython-312.pyc and b/src/agents/__pycache__/rag_agent_manager.cpython-312.pyc differ

src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
src/agents/enhanced_context_manager.py
DELETED
@@ -1,202 +0,0 @@
-from typing import List, Dict, Optional, Tuple
-import spacy
-from collections import defaultdict
-
-class EnhancedContextManager:
-    def __init__(self):
-        """Initialize the context manager with NLP components"""
-        # Load spaCy model for NER and dependency parsing
-        self.nlp = spacy.load("en_core_web_sm")
-        # Track entities and their mentions across conversation
-        self.entity_mentions = defaultdict(list)
-        # Track conversation turns
-        self.conversation_turns = []
-        # Track last processed entity
-        self.last_entity = None
-        # Track last full response context
-        self.last_full_context = None
-
-    def process_turn(self, query: str, response: str) -> None:
-        """Process a conversation turn to extract and track entities"""
-        # Parse query and response
-        query_doc = self.nlp(query)
-        response_doc = self.nlp(response)
-
-        # Extract and track entities from both query and response
-        turn_entities = self._extract_entities(query_doc, response_doc)
-
-        # Store the turn with its entities
-        self.conversation_turns.append({
-            'query': query,
-            'response': response,
-            'entities': turn_entities
-        })
-
-        # Update entity mentions
-        for entity, info in turn_entities.items():
-            self.entity_mentions[entity].append({
-                'turn_index': len(self.conversation_turns) - 1,
-                'info': info
-            })
-
-        # Update last entity and full context
-        if turn_entities:
-            # Prioritize entities in response, then query
-            primary_entity = (
-                list(turn_entities.keys())[0] if turn_entities
-                else None
-            )
-            self.last_entity = primary_entity
-
-        # Store full context for potential reference
-        self.last_full_context = f"{query} {response}"
-
-    def _extract_entities(self, query_doc, response_doc) -> Dict:
-        """Extract named entities and their properties"""
-        entities = {}
-
-        # Process both query and response documents
-        for doc in [query_doc, response_doc]:
-            for ent in doc.ents:
-                # Store entity with its type and text
-                entities[ent.text] = {
-                    'type': ent.label_,
-                    'text': ent.text,
-                    'mentions': [tok.text for tok in doc if tok.head == ent.root]
-                }
-
-        return entities
-
-    def resolve_pronouns(self, current_query: str) -> Optional[str]:
-        """
-        Resolve pronouns in the current query based on conversation history
-
-        Args:
-            current_query (str): Current query with potential pronouns
-
-        Returns:
-            Optional[str]: Query with resolved pronouns, or None if no resolution needed
-        """
-        if not self.conversation_turns:
-            return None
-
-        query_doc = self.nlp(current_query)
-
-        # Find pronouns in current query
-        pronouns = [token for token in query_doc if token.pos_ == "PRON"]
-        if not pronouns:
-            return None
-
-        for pronoun in pronouns:
-            replacement = self._find_antecedent(pronoun.text)
-            if replacement:
-                # Replace the pronoun with the most likely antecedent
-                new_query = current_query.replace(pronoun.text, replacement)
-                return new_query
-
-        return None
-
-    def _find_antecedent(self, pronoun: str) -> Optional[str]:
-        """
-        Find the most recent matching entity for a pronoun
-
-        Args:
-            pronoun (str): Pronoun to resolve
-
-        Returns:
-            Optional[str]: Resolved entity or None
-        """
-        # Pronoun to gender/number mapping
-        pronoun_properties = {
-            'he': {'gender': 'male', 'number': 'singular'},
-            'she': {'gender': 'female', 'number': 'singular'},
-            'they': {'gender': None, 'number': 'plural'},
-            'his': {'gender': 'male', 'number': 'singular'},
-            'her': {'gender': 'female', 'number': 'singular'},
-            'their': {'gender': None, 'number': 'plural'}
-        }
-
-        # Normalize pronoun
-        pronoun_lower = pronoun.lower().rstrip('s')
-
-        # If not a known pronoun, return None
-        if pronoun_lower not in pronoun_properties:
-            return None
-
-        # If a named entity was recently mentioned, use it first
-        if self.last_entity:
-            return self.last_entity
-
-        # Fallback to last full context if no specific entity found
-        if self.last_full_context:
-            return self.last_full_context.split()[0]
-
-        return None
-
-    def enhance_query(self, current_query: str) -> str:
-        """
-        Enhance current query with context and resolved pronouns
-
-        Args:
-            current_query (str): Original query
-
-        Returns:
-            str: Enhanced query with additional context
-        """
-        # First try to resolve pronouns
-        resolved_query = self.resolve_pronouns(current_query)
-
-        # If pronouns are resolved, use the resolved query
-        if resolved_query:
-            return resolved_query
-
-        # Get relevant context
-        context = self._get_relevant_context(current_query)
-
-        # If context found, prepend it to the query
-        if context:
-            return f"{context} {current_query}"
-
-        # If no context resolution, return original query
-        return current_query
-
-    def _get_relevant_context(self, query: str) -> Optional[str]:
-        """
-        Get relevant context from conversation history
-
-        Args:
-            query (str): Current query
-
-        Returns:
-            Optional[str]: Relevant context or None
-        """
-        if not self.conversation_turns:
-            return None
-
-        # Get the most recent turn
-        recent_turn = self.conversation_turns[-1]
-
-        # If the current query contains a pronoun and we have last full context
-        if any(token.pos_ == "PRON" for token in self.nlp(query)):
-            return self.last_full_context
-
-        return None
-
-    def get_conversation_context(self) -> List[Dict]:
-        """Get processed conversation context"""
-        return self.conversation_turns
-
-    def record_last_context(self, last_context: Optional[str] = None) -> None:
-        """
-        Manually record last context if needed
-
-        Args:
-            last_context (Optional[str]): Last context to manually set
-        """
-        if last_context:
-            self.last_full_context = last_context
-            # Try to extract an entity from the context
-            doc = self.nlp(last_context)
-            entities = [ent.text for ent in doc.ents]
-            if entities:
-                self.last_entity = entities[0]
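Note: for reference, the deleted manager was driven roughly like this (a sketch using only methods defined above; assumes spaCy's en_core_web_sm model is installed):

# Hypothetical driver for the removed EnhancedContextManager.
manager = EnhancedContextManager()
manager.process_turn(
    "Who is the project lead?",
    "Alice Johnson leads the project.",
)
# enhance_query() resolves "her" against the most recently tracked entity.
print(manager.enhance_query("What is her role?"))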
src/agents/rag_agent.py
CHANGED
@@ -1,8 +1,8 @@
-
+# src/agents/rag_agent.py
+from typing import List, Optional, Tuple, Dict
 import uuid

 from .excel_aware_rag import ExcelAwareRAGAgent
-from .enhanced_context_manager import EnhancedContextManager
 from ..llms.base_llm import BaseLLM
 from src.embeddings.base_embedding import BaseEmbedding
 from src.vectorstores.base_vectorstore import BaseVectorStore
@@ -11,6 +11,7 @@ from src.db.mongodb_store import MongoDBStore
 from src.models.rag import RAGResponse
 from src.utils.logger import logger

+
 class RAGAgent(ExcelAwareRAGAgent):
     def __init__(
         self,
@@ -21,7 +22,17 @@ class RAGAgent(ExcelAwareRAGAgent):
         max_history_tokens: int = 4000,
         max_history_messages: int = 10
     ):
-        """
+        """
+        Initialize RAG Agent
+
+        Args:
+            llm (BaseLLM): Language model instance
+            embedding (BaseEmbedding): Embedding model instance
+            vector_store (BaseVectorStore): Vector store instance
+            mongodb (MongoDBStore): MongoDB store instance
+            max_history_tokens (int): Maximum tokens in conversation history
+            max_history_messages (int): Maximum messages to keep in history
+        """
         super().__init__()  # Initialize ExcelAwareRAGAgent
         self.llm = llm
         self.embedding = embedding
@@ -31,9 +42,6 @@ class RAGAgent(ExcelAwareRAGAgent):
             max_tokens=max_history_tokens,
             max_messages=max_history_messages
         )
-        # Add enhanced context management while preserving existing functionality
-        self.context_manager = EnhancedContextManager()
-        logger.info("RAGAgent initialized with enhanced context management")

     async def generate_response(
         self,
@@ -41,46 +49,19 @@ class RAGAgent(ExcelAwareRAGAgent):
         conversation_id: Optional[str],
         temperature: float,
         max_tokens: Optional[int] = None,
-        context_docs: Optional[List[str]] = None,
-        stream: bool = False,
-        custom_roles: Optional[List[Dict[str, str]]] = None
+        context_docs: Optional[List[str]] = None
     ) -> RAGResponse:
-        """
-        Generate a response with comprehensive context and role management
-
-        Args:
-            query (str): User query
-            conversation_id (Optional[str]): Conversation identifier
-            temperature (float): LLM temperature for response generation
-            max_tokens (Optional[int]): Maximum tokens for response
-            context_docs (Optional[List[str]]): Pre-retrieved context documents
-            stream (bool): Whether to stream the response
-            custom_roles (Optional[List[Dict[str, str]]]): Custom role instructions
-
-        Returns:
-            RAGResponse: Generated response with context and metadata
-        """
+        """Generate response with specific handling for different query types"""
         try:
-            # Apply custom roles if provided
-            if custom_roles:
-                for role in custom_roles:
-                    # Modify query or context based on role
-                    if role.get('name') == 'introduction_specialist':
-                        query += " Provide a concise, welcoming response."
-                    elif role.get('name') == 'knowledge_based_specialist':
-                        query += " Ensure response is precise and directly from available knowledge."
-
-            # Introduction Handling
+            # First, check if this is an introduction/welcome message query
             is_introduction = (
                 "wants support" in query and
                 "This is Introduction" in query and
                 ("A new user with name:" in query or "An old user with name:" in query)
             )

             if is_introduction:
+                # Handle introduction message - no context needed
                 welcome_message = self._handle_contact_query(query)
                 return RAGResponse(
                     response=welcome_message,
@@ -89,118 +70,70 @@ class RAGAgent(ExcelAwareRAGAgent):
                     scores=None
                 )

+            # Get conversation history if conversation_id exists
             history = []
-            last_context = None
             if conversation_id:
-                logger.info(f"Retrieving conversation history for ID: {conversation_id}")
                 history = await self.mongodb.get_recent_messages(
                     conversation_id,
                     limit=self.conversation_manager.max_messages
                 )

+                # Get relevant history within token limits
                 history = self.conversation_manager.get_relevant_history(
                     messages=history,
                     current_query=query
                 )

-            for msg in history:
-                self.context_manager.process_turn(
-                    msg.get('query', ''),
-                    msg.get('response', '')
-                )
-
-            # Get last context if available
-            if history and history[-1].get('response'):
-                last_context = history[-1]['response']
-
-            # Query Enhancement
-            enhanced_query = self.context_manager.enhance_query(query)
-
-            # Manual Pronoun Handling Fallback
-            if enhanced_query == query:
-                pronoun_map = {
-                    'his': 'he',
-                    'her': 'she',
-                    'their': 'they'
-                }
-                words = query.lower().split()
-                for pronoun, replacement in pronoun_map.items():
-                    if pronoun in words:
-                        # Try to use last context
-                        if last_context:
-                            self.context_manager.record_last_context(last_context)
-                            enhanced_query = self.context_manager.enhance_query(query)
-                        break
-
-            logger.info(f"Enhanced query: {enhanced_query}")
-
-            # Context Retrieval
+            # Retrieve context if not provided
             if not context_docs:
-                logger.info("Retrieving context for enhanced query")
                 context_docs, sources, scores = await self.retrieve_context(
+                    query=query,
                     conversation_history=history
                 )
             else:
-                sources =
+                sources = None
                 scores = None

+            # Check if we have any relevant context
             if not context_docs:
-                logger.info("No relevant context found")
                 return RAGResponse(
                     response="Information about this is not available, do you want to inquire about something else?",
                     context_docs=[],
                     sources=[],
                     scores=None
                 )

+            # Check if this is an Excel-related query
             has_excel_content = any('Sheet:' in doc for doc in context_docs)
             if has_excel_content:
-                logger.info("Processing Excel-specific content")
                 try:
-                    context_docs = self._process_excel_context(
+                    context_docs = self._process_excel_context(
+                        context_docs, query)
                 except Exception as e:
                     logger.warning(f"Error processing Excel context: {str(e)}")

+            # Generate prompt with context and history
+            augmented_prompt = self.conversation_manager.generate_prompt_with_history(
+                current_query=query,
                 history=history,
                 context_docs=context_docs
             )

-            if stream:
-                # TODO: Implement actual streaming logic
-                # This is a placeholder and needs proper implementation
-                logger.warning("Streaming not fully implemented")
-
-            # Standard Response Generation
+            # Generate initial response
             response = self.llm.generate(
-                prompt=
+                prompt=augmented_prompt,
                 temperature=temperature,
                 max_tokens=max_tokens
             )

+            # Clean the response
             cleaned_response = self._clean_response(response)

+            # For Excel queries, enhance the response
             if has_excel_content:
                 try:
                     enhanced_response = await self.enhance_excel_response(
-                        query=
+                        query=query,
                         response=cleaned_response,
                         context_docs=context_docs
                     )
@@ -209,158 +142,122 @@ class RAGAgent(ExcelAwareRAGAgent):
             except Exception as e:
                 logger.warning(f"Error enhancing Excel response: {str(e)}")

-            self.context_manager.process_turn(query, cleaned_response)
-
-            # Metadata Generation
-            metadata = {
-                'llm_provider': getattr(self.llm, 'model_name', 'unknown'),
-                'temperature': temperature,
-                'conversation_id': conversation_id,
-                'context_sources': sources,
-                'has_excel_content': has_excel_content
-            }
-
-            logger.info("Successfully generated response")
+            # Return the final response
             return RAGResponse(
                 response=cleaned_response,
                 context_docs=context_docs,
                 sources=sources,
-                scores=scores,
-                metadata=metadata  # Added metadata
+                scores=scores
             )

         except Exception as e:
-            logger.error(f"Error in
+            logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
             raise

+    def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
+        """
+        Create prompt for generating response from context
+
+        Args:
+            query (str): User query
+            context_docs (List[str]): Retrieved context documents
+
+        Returns:
+            str: Formatted prompt for the LLM
+        """
+        if not context_docs:
+            return f"Query: {query}\nResponse: Information about this is not available, do you want to inquire about something else?"
+
+        # Format context documents
+        formatted_context = "\n\n".join(
+            f"Context {i+1}:\n{doc.strip()}"
+            for i, doc in enumerate(context_docs)
+            if doc and doc.strip()
+        )
+
+        # Build the prompt with detailed instructions
+        prompt = f"""You are a knowledgeable assistant. Use the following context to answer the query accurately and informatively.
+
+Context Information:
+{formatted_context}
+
+Query: {query}
+
+Instructions:
+1. Base your response ONLY on the information provided in the context above
+2. If the context contains numbers, statistics, or specific details, include them in your response
+3. Keep your response focused and relevant to the query
+4. Use clear and professional language
+5. If the context includes technical terms, explain them appropriately
+6. Do not make assumptions or add information not present in the context
+7. If specific sections of a report are mentioned, maintain their original structure
+8. Format the response in a clear, readable manner
+9. If the context includes chronological information, maintain the proper sequence
+
+Response:"""
+
+        return prompt
+
     async def retrieve_context(
         self,
         query: str,
         conversation_history: Optional[List[Dict]] = None,
         top_k: int = 3
     ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
-        """
+        """
+        Retrieve context with conversation history enhancement
+        """
+        # Enhance query with conversation history
+        if conversation_history:
+            recent_queries = [
+                msg['query'] for msg in conversation_history[-2:]
+                if msg.get('query')
+            ]
+            enhanced_query = " ".join([*recent_queries, query])
+        else:
+            enhanced_query = query
+
+        # Debug log the enhanced query
+        logger.info(f"Enhanced query: {enhanced_query}")
+
         # Embed the enhanced query
         query_embedding = self.embedding.embed_query(enhanced_query)

         # Debug log embedding shape
         logger.info(f"Query embedding shape: {len(query_embedding)}")

         # Retrieve similar documents
         results = self.vector_store.similarity_search(
             query_embedding,
             top_k=top_k
         )

-        if not results:
-            logger.info("No results found in similarity search")
-            return [], [], None
+        # Debug log search results
+        logger.info(f"Number of search results: {len(results)}")
+        for i, result in enumerate(results):
+            logger.info(f"Result {i} score: {result.get('score', 'N/A')}")
+            logger.info(
+                f"Result {i} text preview: {result.get('text', '')[:100]}...")

         # Process results
         documents = [doc['text'] for doc in results]
         sources = [self._convert_metadata_to_strings(doc['metadata'])
                    for doc in results]
         scores = [doc['score'] for doc in results
                   if doc.get('score') is not None]

         # Return scores only if available for all documents
         if len(scores) != len(documents):
             scores = None

+        return documents, sources, scores

-    def _clean_response(self, response: str) -> str:
-        """Clean response text while preserving key information"""
-        if not response:
-            return response
-
-        # Keep only the most common phrases to remove
-        phrases_to_remove = [
-            "Based on the context,",
-            "According to the documents,",
-            "From the information available,",
-            "Based on the provided information,",
-            "I apologize,"
-        ]
-
-        cleaned_response = response
-        for phrase in phrases_to_remove:
-            cleaned_response = cleaned_response.replace(phrase, "").strip()
-
-        cleaned_response = " ".join(cleaned_response.split())
-
-        if not cleaned_response:
-            return response
-
-        if cleaned_response[0].islower():
-            cleaned_response = cleaned_response[0].upper() + cleaned_response[1:]
-
-        return cleaned_response

     def _convert_metadata_to_strings(self, metadata: Dict) -> Dict:
-        """Convert metadata values to strings"""
-        return metadata
-
-    def _handle_contact_query(self, query: str) -> str:
-        """Handle contact/introduction queries"""
-        try:
-            name_start = query.find('name: "') + 7
-            name_end = query.find('"', name_start)
-            name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
-
-            is_returning = (
-                "An old user with name:" in query and
-                "wants support again" in query
-            )
-
-            return f"Welcome back {name}, How can I help you?" if is_returning else f"Welcome {name}, How can I help you?"
-
-        except Exception as e:
-            logger.error(f"Error handling contact query: {str(e)}")
-            return "Welcome, How can I help you?"
+        """Convert numeric metadata values to strings"""
+        converted = {}
+        for key, value in metadata.items():
+            if isinstance(value, (int, float)):
+                converted[key] = str(value)
+            else:
+                converted[key] = value
+        return converted
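Note: the rewritten _convert_metadata_to_strings now stringifies only numeric values instead of returning the metadata untouched; a standalone copy of the new logic for illustration:

# Standalone copy of the new _convert_metadata_to_strings logic.
def convert_metadata_to_strings(metadata: dict) -> dict:
    converted = {}
    for key, value in metadata.items():
        if isinstance(value, (int, float)):
            converted[key] = str(value)
        else:
            converted[key] = value
    return converted

print(convert_metadata_to_strings({'page': 3, 'score': 0.92, 'source': 'a.pdf'}))
# -> {'page': '3', 'score': '0.92', 'source': 'a.pdf'}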
src/agents/rag_agent_manager.py
DELETED
@@ -1,77 +0,0 @@
-# src/agents/rag_agent_manager.py
-from typing import Optional
-import weakref
-
-from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
-from src.llms.base_llm import BaseLLM
-from src.embeddings.base_embedding import BaseEmbedding
-from src.vectorstores.base_vectorstore import BaseVectorStore
-from src.db.mongodb_store import MongoDBStore
-from src.utils.logger import logger
-
-class RAGAgentManager:
-    """
-    Singleton manager for RAG Agent instances with intelligent caching
-    """
-    _instance = None
-
-    def __new__(cls):
-        if not cls._instance:
-            cls._instance = super().__new__(cls)
-        return cls._instance
-
-    def __init__(self):
-        # Ensure this is only initialized once
-        if not hasattr(self, '_initialized'):
-            self._rag_agent = None
-            self._initialized = True
-
-    def get_rag_agent(
-        self,
-        llm: BaseLLM,
-        embedding_model: BaseEmbedding,
-        vector_store: BaseVectorStore,
-        mongodb: MongoDBStore
-    ) -> SystemInstructionsRAGAgent:
-        """
-        Get or create a singleton RAG agent instance with intelligent caching
-
-        Args:
-            llm: Language Model instance
-            embedding_model: Embedding model instance
-            vector_store: Vector store instance
-            mongodb: MongoDB store instance
-
-        Returns:
-            SystemInstructionsRAGAgent: Singleton instance of the RAG agent
-        """
-        # If RAG agent exists and all dependencies are the same, return it
-        if self._rag_agent is not None:
-            logger.info("Reusing existing RAG agent instance")
-            return self._rag_agent
-
-        try:
-            logger.info("Creating new RAG agent instance")
-            # Create the agent
-            self._rag_agent = SystemInstructionsRAGAgent(
-                llm=llm,
-                embedding=embedding_model,
-                vector_store=vector_store,
-                mongodb=mongodb
-            )
-
-            return self._rag_agent
-
-        except Exception as e:
-            logger.error(f"Error creating RAG agent: {str(e)}")
-            raise
-
-    def reset_rag_agent(self):
-        """
-        Reset the RAG agent instance
-        """
-        logger.info("Resetting RAG agent instance")
-        self._rag_agent = None
-
-# Create a global instance for easy import
-rag_agent_manager = RAGAgentManager()
src/agents/system_instructions_rag.py
CHANGED
@@ -1,34 +1,12 @@
|
|
1 |
# src/agents/system_instructions_rag.py
|
2 |
-
from typing import List, Dict, Optional
|
3 |
-
import
|
4 |
-
from src.agents.rag_agent import RAGAgent
|
5 |
-
from src.llms.base_llm import BaseLLM
|
6 |
-
from src.embeddings.base_embedding import BaseEmbedding
|
7 |
-
from src.vectorstores.base_vectorstore import BaseVectorStore
|
8 |
-
from src.db.mongodb_store import MongoDBStore
|
9 |
-
from src.models.rag import RAGResponse
|
10 |
from src.utils.logger import logger
|
|
|
|
|
11 |
|
12 |
class SystemInstructionsRAGAgent(RAGAgent):
|
13 |
-
|
14 |
-
self,
|
15 |
-
llm: BaseLLM,
|
16 |
-
embedding: BaseEmbedding,
|
17 |
-
vector_store: BaseVectorStore,
|
18 |
-
mongodb: MongoDBStore,
|
19 |
-
max_history_tokens: int = 4000,
|
20 |
-
max_history_messages: int = 10
|
21 |
-
):
|
22 |
-
"""Initialize SystemInstructionsRAGAgent with enhanced context management"""
|
23 |
-
super().__init__(
|
24 |
-
llm=llm,
|
25 |
-
embedding=embedding,
|
26 |
-
vector_store=vector_store,
|
27 |
-
mongodb=mongodb,
|
28 |
-
max_history_tokens=max_history_tokens,
|
29 |
-
max_history_messages=max_history_messages
|
30 |
-
)
|
31 |
-
self.nlp = spacy.load("en_core_web_sm")
|
32 |
|
33 |
async def generate_response(
|
34 |
self,
|
@@ -36,18 +14,19 @@ class SystemInstructionsRAGAgent(RAGAgent):
|
|
36 |
conversation_id: Optional[str] = None,
|
37 |
temperature: float = 0.7,
|
38 |
max_tokens: Optional[int] = None,
|
39 |
-
context_docs: Optional[List[str]] = None
|
40 |
-
stream: bool = False
|
41 |
) -> RAGResponse:
|
42 |
-
"""Generate response with
|
43 |
try:
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
48 |
|
49 |
-
|
50 |
-
|
51 |
welcome_message = self._handle_contact_query(query)
|
52 |
return RAGResponse(
|
53 |
response=welcome_message,
|
@@ -56,282 +35,200 @@ class SystemInstructionsRAGAgent(RAGAgent):
|
|
56 |
scores=None
|
57 |
)
|
58 |
|
59 |
-
# Get
|
60 |
-
|
61 |
if conversation_id:
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
#
|
78 |
-
if
|
79 |
-
|
80 |
-
else:
|
81 |
-
# Try with original query first
|
82 |
-
current_context, sources, scores = await self.retrieve_context(
|
83 |
query,
|
84 |
-
conversation_history=
|
85 |
)
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
conversation_history=history
|
94 |
-
)
|
95 |
-
|
96 |
-
# If still no context, try history fallback
|
97 |
-
if not current_context:
|
98 |
-
current_context, sources = self._get_context_from_history(history)
|
99 |
-
|
100 |
-
logger.info(f"Retrieved {len(current_context) if current_context else 0} context documents")
|
101 |
-
|
102 |
-
# Check context relevance
|
103 |
-
has_relevant_context = self._check_context_relevance(query, current_context or [])
|
104 |
-
logger.info(f"Context relevance check result: {has_relevant_context}")
|
105 |
-
|
106 |
-
# Handle no context case
|
107 |
if not has_relevant_context:
|
108 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
-
# Generate response
|
111 |
-
prompt = self._create_response_prompt(query, current_context)
|
112 |
response_text = self.llm.generate(
|
113 |
-
prompt
|
114 |
temperature=temperature,
|
115 |
max_tokens=max_tokens
|
116 |
)
|
117 |
|
118 |
-
#
|
119 |
cleaned_response = self._clean_response(response_text)
|
120 |
if self._is_no_info_response(cleaned_response):
|
121 |
-
return
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
if any('Sheet:' in doc for doc in (current_context or [])):
|
128 |
-
try:
|
129 |
-
cleaned_response = await self.enhance_excel_response(
|
130 |
-
query=query,
|
131 |
-
response=cleaned_response,
|
132 |
-
context_docs=current_context
|
133 |
-
)
|
134 |
-
except Exception as e:
|
135 |
-
logger.warning(f"Error enhancing Excel response: {str(e)}")
|
136 |
|
137 |
return RAGResponse(
|
138 |
response=cleaned_response,
|
139 |
-
context_docs=
|
140 |
sources=sources,
|
141 |
scores=scores
|
142 |
)
|
143 |
|
144 |
except Exception as e:
|
145 |
-
logger.error(f"Error in
|
146 |
raise
|
147 |
|
148 |
-
def
|
149 |
-
"""Convert all metadata values to strings"""
|
150 |
-
return {
|
151 |
-
key: str(value) if value is not None else None
|
152 |
-
for key, value in metadata.items()
|
153 |
-
}
|
154 |
-
|
155 |
-
async def retrieve_context(
|
156 |
self,
|
157 |
query: str,
|
|
|
158 |
conversation_history: Optional[List[Dict]] = None
|
159 |
-
) ->
|
160 |
-
"""
|
161 |
-
|
162 |
-
logger.info(f"Processing query for context retrieval: {query}")
|
163 |
-
|
164 |
-
collection_data = self.vector_store.collection.get()
|
165 |
-
|
166 |
-
if not collection_data or 'documents' not in collection_data:
|
167 |
-
logger.warning("No documents found in ChromaDB")
|
168 |
-
return [], [], None
|
169 |
-
|
170 |
-
documents = collection_data['documents']
|
171 |
-
metadatas = collection_data.get('metadatas', [])
|
172 |
-
|
173 |
-
# Clean and enhance query with date variations
|
174 |
-
clean_query = query.lower().strip()
|
175 |
-
|
176 |
-
# Extract and enhance date information
|
177 |
-
import re
|
178 |
-
from datetime import datetime
|
179 |
-
|
180 |
-
date_pattern = r'(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* \d{1,2},? \d{4}'
|
181 |
-
dates = re.findall(date_pattern, clean_query.lower())
|
182 |
-
|
183 |
-
enhanced_query = clean_query
|
184 |
-
target_date = None
|
185 |
-
|
186 |
-
if dates:
|
187 |
-
try:
|
188 |
-
date_obj = datetime.strptime(dates[0], '%b %d, %Y')
|
189 |
-
target_date = date_obj.strftime('%b %d, %Y')
|
190 |
-
|
191 |
-
date_variations = [
|
192 |
-
date_obj.strftime('%B %d, %Y'),
|
193 |
-
date_obj.strftime('%d/%m/%Y'),
|
194 |
-
date_obj.strftime('%Y-%m-%d'),
|
195 |
-
target_date
|
196 |
-
]
|
197 |
-
|
198 |
-
enhanced_query = f"{clean_query} {' '.join(date_variations)}"
|
199 |
-
|
200 |
-
except ValueError as e:
|
201 |
-
logger.warning(f"Error parsing date: {str(e)}")
|
202 |
-
|
203 |
-
# First try exact date matching
|
204 |
-
exact_matches = []
|
205 |
-
exact_metadata = []
|
206 |
-
|
207 |
-
if target_date:
|
208 |
-
for i, doc in enumerate(documents):
|
209 |
-
if target_date in doc:
|
210 |
-
logger.info(f"Found exact date match in document {i}")
|
211 |
-
exact_matches.append(doc)
|
212 |
-
if metadatas:
|
213 |
-
# Convert metadata values to strings
|
214 |
-
exact_metadata.append(self._convert_metadata_to_strings(metadatas[i]))
|
215 |
-
|
216 |
-
if exact_matches:
|
217 |
-
logger.info(f"Found {len(exact_matches)} exact date matches")
|
218 |
-
document_id = exact_metadata[0].get('document_id') if exact_metadata else None
|
219 |
-
|
220 |
-
if document_id:
|
221 |
-
all_related_chunks = []
|
222 |
-
all_related_metadata = []
|
223 |
-
all_related_scores = []
|
224 |
-
|
225 |
-
for i, doc in enumerate(documents):
|
226 |
-
if metadatas[i].get('document_id') == document_id:
|
227 |
-
all_related_chunks.append(doc)
|
228 |
-
# Convert metadata values to strings
|
229 |
-
all_related_metadata.append(self._convert_metadata_to_strings(metadatas[i]))
|
230 |
-
all_related_scores.append(1.0)
|
231 |
-
|
232 |
-
# Sort chunks by their index
|
233 |
-
sorted_results = sorted(
|
234 |
-
zip(all_related_chunks, all_related_metadata, all_related_scores),
|
235 |
-
key=lambda x: int(x[1].get('chunk_index', '0')) # Convert to int for sorting
|
236 |
-
)
|
237 |
-
|
238 |
-
sorted_chunks, sorted_metadata, sorted_scores = zip(*sorted_results)
|
239 |
-
|
240 |
-
logger.info(f"Returning {len(sorted_chunks)} chunks from document {document_id}")
|
241 |
-
return list(sorted_chunks), list(sorted_metadata), list(sorted_scores)
|
242 |
-
|
243 |
-
# If no exact matches, use enhanced query for embedding search
|
244 |
-
logger.info("No exact matches found, using enhanced query for embedding search")
|
245 |
-
-            query_embedding = self.embedding.embed_query(enhanced_query)
-
-            results = self.vector_store.similarity_search(
-                query_embedding,
-                top_k=5
-            )
-
-            if not results:
-                logger.warning("No results found in similarity search")
-                return [], [], None
-
-            context_docs = []
-            sources = []
-            scores = []
-
-            sorted_results = sorted(results, key=lambda x: x.get('score', 0), reverse=True)
-
-            for result in sorted_results:
-                score = result.get('score', 0)
-                if score > 0.3:
-                    context_docs.append(result.get('text', ''))
-                    # Convert metadata values to strings
-                    sources.append(self._convert_metadata_to_strings(result.get('metadata', {})))
-                    scores.append(score)
-
-            if context_docs:
-                logger.info(f"Returning {len(context_docs)} documents from similarity search")
-                return context_docs, sources, scores
-
-            logger.warning("No relevant documents found")
-            return [], [], None
-
-        except Exception as e:
-            logger.error(f"Error in retrieve_context: {str(e)}")
-            logger.exception("Full traceback:")
-            return [], [], None
-
-    def _is_introduction_query(self, query: str) -> bool:
-        """Check if query is an introduction message"""
-        return (
-            "wants support" in query and
-            "This is Introduction" in query and
-            ("A new user with name:" in query or "An old user with name:" in query)
-        )
-
-    def _get_context_from_history(
-        self,
-        history: List[Dict]
-    ) -> Tuple[Optional[List[str]], Optional[List[Dict]]]:
-        """Extract context from conversation history"""
-        for msg in reversed(history):
-            if msg.get('context') and not self._is_no_info_response(msg.get('response', '')):
-                return msg['context'], msg.get('sources', [])
-        return None, None
-
-    def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
-        """Create prompt for response generation"""
         formatted_context = '\n\n'.join(
             f"Context {i+1}:\n{doc.strip()}"
             for i, doc in enumerate(context_docs)
             if doc and doc.strip()
         )

         return f"""
-Use ...
 
 {formatted_context}
 
 Instructions:
-1. Use ...
-2. If the information is found ...
 3. Do not make assumptions or add information not present in the context
 4. Ensure the response is clear and complete based on available information
-5. If you cannot find relevant information about the specific query
 respond exactly with: "Information about this is not available, do you want to inquire about something else?"
 
 Query: {query}
 Response:"""
 
-    def ...
-        """...
-        ...
 
     def _clean_response(self, response: str) -> str:
-        """Clean response ...
         if not response:
             return response
 
@@ -351,6 +248,7 @@ Response:"""
             "Here's what I found:",
             "Here's the information you requested:",
             "According to the provided information,",
             "The information suggests that",
             "From what I can see,",
             "Let me explain",
@@ -359,85 +257,26 @@ Response:"""
             "I can see that",
             "Sure,",
             "Well,",
             "I apologize,"
         ]
-
         cleaned_response = response
         for phrase in phrases_to_remove:
             cleaned_response = cleaned_response.replace(phrase, "").strip()
-
         cleaned_response = " ".join(cleaned_response.split())
-
         if not cleaned_response:
             return response
-
-        if cleaned_response[0].islower():
-            cleaned_response = cleaned_response[0].upper() + cleaned_response[1:]
-
-        return cleaned_response
-
-    def _is_no_info_response(self, response: str) -> bool:
-        """Check if response indicates no information available"""
-        no_info_indicators = [
-            "i do not have",
-            "i don't have",
-            "no information",
-            "not available",
-            "could not find",
-            "couldn't find",
-            "cannot find",
-            "don't know",
-            "do not know",
-            "unable to find",
-            "no data",
-            "no relevant"
-        ]
-        response_lower = response.lower()
-        return any(indicator in response_lower for indicator in no_info_indicators)
-
-    def _check_context_relevance(self, query: str, context_docs: List[str]) -> bool:
-        """Enhanced context relevance checking"""
-        if not context_docs:
-            return False
-
-        # Clean and prepare query
-        clean_query = query.lower().strip()
-        query_terms = set(word for word in clean_query.split()
-                          if word not in {'tell', 'me', 'about', 'what', 'is', 'the'})
-
-        for doc in context_docs:
-            if not doc:
-                continue
-
-            doc_lower = doc.lower()
-
-            # For CSV-like content, check each line
-            lines = doc_lower.split('\n')
-            for line in lines:
-                # Check if any query term appears in the line
-                if any(term in line for term in query_terms):
-                    return True
-
-            # Also check the whole document for good measure
-            if any(term in doc_lower for term in query_terms):
-                return True
-
-        return False
-
-    def _handle_contact_query(self, query: str) -> str:
-        """Handle contact/introduction queries"""
-        try:
-            name_start = query.find('name: "') + 7
-            name_end = query.find('"', name_start)
-            name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
-
-            is_returning = (
-                "An old user with name:" in query and
-                "wants support again" in query
-            )
-
-            return f"Welcome back {name}, How can I help you?" if is_returning else f"Welcome {name}, How can I help you?"
-
-        except Exception as e:
-            logger.error(f"Error handling contact query: {str(e)}")
-            return "Welcome, How can I help you?"
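The removed retrieve_context above ranks similarity-search hits by score and keeps only those above 0.3. A standalone sketch of that filtering step, using hypothetical result dicts in place of real vector-store output:

# Sketch of the ranking/filtering step from the removed retrieve_context.
# The `results` list below is hypothetical; real entries come from
# vector_store.similarity_search and carry 'text', 'metadata' and 'score'.
results = [
    {"text": "refund policy ...", "metadata": {"document_id": "a"}, "score": 0.82},
    {"text": "unrelated note", "metadata": {"document_id": "b"}, "score": 0.12},
    {"text": "shipping times ...", "metadata": {"document_id": "c"}, "score": 0.45},
]

# Highest score first, as in sorted(..., key=lambda x: x.get('score', 0), reverse=True)
sorted_results = sorted(results, key=lambda x: x.get("score", 0), reverse=True)

context_docs, sources, scores = [], [], []
for result in sorted_results:
    score = result.get("score", 0)
    if score > 0.3:  # same relevance cutoff as the code above
        context_docs.append(result.get("text", ""))
        sources.append(result.get("metadata", {}))
        scores.append(score)

print(context_docs)  # ['refund policy ...', 'shipping times ...']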
 # src/agents/system_instructions_rag.py
+from typing import List, Dict, Optional
+from src.agents.rag_agent import RAGResponse
 from src.utils.logger import logger
+from src.agents.rag_agent import RAGAgent
+
 
 class SystemInstructionsRAGAgent(RAGAgent):
+    """RAG Agent with enhanced system instructions for specific use cases"""
 
     async def generate_response(
         self,
         query: str,
         conversation_id: Optional[str] = None,
         temperature: float = 0.7,
         max_tokens: Optional[int] = None,
+        context_docs: Optional[List[str]] = None
     ) -> RAGResponse:
+        """Generate response with specific handling for introduction and no-context cases"""
         try:
+            # First, check if this is an introduction/welcome message query
+            is_introduction = (
+                "wants support" in query and
+                "This is Introduction" in query and
+                ("A new user with name:" in query or "An old user with name:" in query)
+            )
 
+            if is_introduction:
+                # Handle introduction message - no context needed
                 welcome_message = self._handle_contact_query(query)
                 return RAGResponse(
                     response=welcome_message,
                     context_docs=[],
                     sources=[],
                     scores=None
                 )
 
+            # Get conversation history if conversation_id exists
+            conversation_history = []
             if conversation_id:
+                try:
+                    conversation_history = await self.mongodb.get_recent_messages(
+                        conversation_id,
+                        limit=self.conversation_manager.max_messages
+                    )
+
+                    # Get relevant history within token limits
+                    conversation_history = self.conversation_manager.get_relevant_history(
+                        messages=conversation_history,
+                        current_query=query
+                    )
+                except Exception as e:
+                    logger.warning(
+                        f"Error fetching conversation history: {str(e)}")
+
+            # For all other queries, proceed with context-based response
+            if not context_docs:
+                context_docs, sources, scores = await self.retrieve_context(
                     query,
+                    conversation_history=conversation_history
                 )
 
+            # Check if we have relevant context
+            has_relevant_context = self._check_context_relevance(
+                query, context_docs or []
+            )
+
+            # If no relevant context found, return the standard message
             if not has_relevant_context:
+                return RAGResponse(
+                    response="Information about this is not available, do you want to inquire about something else?",
+                    context_docs=[],
+                    sources=[],
+                    scores=None
+                )
+
+            # Generate response using context and conversation history
+            prompt = self._create_response_prompt(
+                query=query,
+                context_docs=context_docs,
+                conversation_history=conversation_history
+            )
 
             response_text = self.llm.generate(
+                prompt,
                 temperature=temperature,
                 max_tokens=max_tokens
             )
 
+            # Check if the generated response indicates no information
             cleaned_response = self._clean_response(response_text)
             if self._is_no_info_response(cleaned_response):
+                return RAGResponse(
+                    response="Information about this is not available, do you want to inquire about something else?",
+                    context_docs=[],
+                    sources=[],
+                    scores=None
+                )
 
             return RAGResponse(
                 response=cleaned_response,
+                context_docs=context_docs,
                 sources=sources,
                 scores=scores
             )
 
         except Exception as e:
+            logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
             raise
 
+    def _create_response_prompt(
         self,
         query: str,
+        context_docs: List[str],
         conversation_history: Optional[List[Dict]] = None
+    ) -> str:
+        """Create prompt for generating response from context and conversation history"""
+        # Format context documents
         formatted_context = '\n\n'.join(
             f"Context {i+1}:\n{doc.strip()}"
             for i, doc in enumerate(context_docs)
             if doc and doc.strip()
         )
 
+        # Format conversation history if available
+        history_context = ""
+        if conversation_history:
+            history_messages = []
+            # Use last 3 messages for context
+            for msg in conversation_history[-3:]:
+                role = msg.get('role', 'unknown')
+                content = msg.get('content', '')
+                history_messages.append(f"{role.capitalize()}: {content}")
+
+            if history_messages:
+                history_context = "\nPrevious Conversation:\n" + \
+                    "\n".join(history_messages)
+
         return f"""
+Use the following context and conversation history to provide information about: {query}
 
+Context Information:
 {formatted_context}
+{history_context}
 
 Instructions:
+1. Use information from both the context and conversation history
+2. If the information is found, provide a direct and concise response
 3. Do not make assumptions or add information not present in the context
 4. Ensure the response is clear and complete based on available information
+5. If you cannot find relevant information about the specific query,
 respond exactly with: "Information about this is not available, do you want to inquire about something else?"
 
 Query: {query}
 Response:"""
 
+    def _is_no_info_response(self, response: str) -> bool:
+        """Check if the response indicates no information available"""
+        no_info_indicators = [
+            "i do not have",
+            "i don't have",
+            "no information",
+            "not available",
+            "could not find",
+            "couldn't find",
+            "cannot find"
+        ]
+        response_lower = response.lower()
+        return any(indicator in response_lower for indicator in no_info_indicators)
+
+    def _check_context_relevance(self, query: str, context_docs: List[str]) -> bool:
+        """Check if context contains information relevant to the query"""
+        if not context_docs:
+            return False
+
+        # Extract key terms from query (keeping important words)
+        query_words = query.lower().split()
+        stop_words = {'me', 'a', 'about', 'what', 'is',
+                      'are', 'the', 'in', 'how', 'why', 'when', 'where'}
+
+        # Remove only basic stop words, keep important terms like "report", "share", etc.
+        query_terms = {word for word in query_words if word not in stop_words}
+
+        # Add additional relevant terms that might appear in the content
+        related_terms = {
+            'comprehensive',
+            'report',
+            'overview',
+            'summary',
+            'details',
+            'information'
+        }
+        query_terms.update(
+            word for word in query_words if word in related_terms)
+
+        # Check each context document for relevance
+        for doc in context_docs:
+            if not doc:
+                continue
+            doc_lower = doc.lower()
+
+            # Consider document relevant if it contains any query terms
+            # or if it starts with common report headers
+            if any(term in doc_lower for term in query_terms) or \
+               any(header in doc_lower for header in ['overview', 'comprehensive report', 'summary']):
+                return True
+
+        return False
+
+    def _handle_contact_query(self, query: str) -> str:
+        """Handle queries from /user/contact endpoint"""
+        try:
+            name_start = query.find('name: "') + 7
+            name_end = query.find('"', name_start)
+            name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
+
+            is_returning = (
+                "An old user with name:" in query and
+                "wants support again" in query
+            )
+
+            if is_returning:
+                return f"Welcome back {name}, How can I help you?"
+            return f"Welcome {name}, How can I help you?"
+
+        except Exception as e:
+            logger.error(f"Error handling contact query: {str(e)}")
+            return "Welcome, How can I help you?"
 
     def _clean_response(self, response: str) -> str:
+        """Clean response by removing unwanted phrases"""
         if not response:
             return response
 
@@ -351,6 +248,7 @@ Response:"""
             "Here's what I found:",
             "Here's the information you requested:",
             "According to the provided information,",
+            "Based on the documents,",
             "The information suggests that",
             "From what I can see,",
             "Let me explain",
@@ -359,85 +257,26 @@ Response:"""
             "I can see that",
             "Sure,",
             "Well,",
+            "Based on the given context,",
+            "The available information shows that",
+            "From the context provided,",
+            "The documentation mentions that",
+            "According to the context,",
+            "As shown in the context,",
             "I apologize,"
         ]
+
         cleaned_response = response
         for phrase in phrases_to_remove:
            cleaned_response = cleaned_response.replace(phrase, "").strip()
+
         cleaned_response = " ".join(cleaned_response.split())
+
         if not cleaned_response:
             return response
 
+        if cleaned_response[0].islower():
+            cleaned_response = cleaned_response[0].upper(
+            ) + cleaned_response[1:]
 
+        return cleaned_response
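The _check_context_relevance method above is, at heart, a keyword-overlap test. A self-contained sketch of the same idea, runnable outside the agent (names are illustrative):

# Standalone keyword-overlap relevance test, mirroring _check_context_relevance.
STOP_WORDS = {'me', 'a', 'about', 'what', 'is', 'are',
              'the', 'in', 'how', 'why', 'when', 'where'}

def is_relevant(query, context_docs):
    """Return True if any document shares a non-stop-word term with the query."""
    query_terms = {w for w in query.lower().split() if w not in STOP_WORDS}
    for doc in context_docs:
        if doc and any(term in doc.lower() for term in query_terms):
            return True
    return False

print(is_relevant("what is the refund policy", ["Our refund policy lasts 30 days."]))  # True
print(is_relevant("what is the refund policy", ["Weekly team schedule."]))             # False

Because `in` does substring matching, short query terms can over-match; that is presumably why the production method also keeps a whitelist of report-style terms rather than matching on everything.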
src/implementations/__pycache__/document_service.cpython-312.pyc CHANGED
Binary files a/src/implementations/__pycache__/document_service.cpython-312.pyc and b/src/implementations/__pycache__/document_service.cpython-312.pyc differ
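For orientation before the main.py diff below: the new endpoint code builds the agent from the repo's own factories. A minimal construction sketch under those assumptions (the MongoDB URI is a placeholder; the app reads it from settings.MONGODB_URI):

# Sketch of wiring up SystemInstructionsRAGAgent the way /chat does below.
from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
from src.db.mongodb_store import MongoDBStore
from src.utils.llm_utils import get_llm_instance, get_vector_store

async def build_agent():
    vector_store, embedding_model = await get_vector_store()
    llm = get_llm_instance("openai")
    mongodb = MongoDBStore("mongodb://localhost:27017")  # placeholder URI
    return SystemInstructionsRAGAgent(
        llm=llm,
        embedding=embedding_model,
        vector_store=vector_store,
        mongodb=mongodb,
    )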
src/main.py CHANGED
@@ -1,4 +1,31 @@
 # src/main.py
 from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
 from fastapi.responses import StreamingResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
@@ -9,53 +36,36 @@ from datetime import datetime
 from pathlib import Path
 import os
 import asyncio
-os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
-#os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
 
 
-from fastapi.responses import RedirectResponse
-from google.oauth2.credentials import Credentials
-from google_auth_oauthlib.flow import Flow
-from src.utils.google_drive_service import GoogleDriveService
 
 # Import custom modules1
-#from src.agents.rag_agent import RAGAgent
-from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
-from src.agents.rag_agent_manager import rag_agent_manager
-from src.models.document import AllDocumentsResponse, StoredDocument
-from src.models.UserContact import UserContactRequest
-from src.utils.document_processor import DocumentProcessor
-from src.utils.drive_document_processor import DriveDocumentProcessor
-from src.utils.conversation_summarizer import ConversationSummarizer
-from src.utils.logger import logger
-from src.utils.llm_utils import get_llm_instance, get_vector_store
-from src.db.mongodb_store import MongoDBStore
-from src.implementations.document_service import DocumentService
-from src.models import (
-    ChatRequest,
-    ChatResponse,
-    BatchUploadResponse,
-    SummarizeRequest,
-    SummaryResponse,
-    FeedbackRequest
-)
-from fastapi import HTTPException, Depends
-from fastapi.security import APIKeyHeader
-from src.utils.database_cleanup import perform_cleanup
 
-from config.config import settings
 
 app = FastAPI(title="Chatbot API")
 
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["http://localhost:8080",
     allow_credentials=True,
     allow_methods=["*"],  # Allows all methods
     allow_headers=["*"],  # Allows all headers
 )
 
-#google_drive_service = GoogleDriveService()
 
 # Initialize MongoDB
 mongodb = MongoDBStore(settings.MONGODB_URI)
@@ -75,6 +85,7 @@ app.mount("/docs", StaticFiles(directory=str(UPLOADS_DIR)), name="documents")
 # Security setup
 API_KEY_HEADER = APIKeyHeader(name="ADMIN_API_KEY")
 
 async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
     """Verify admin API key"""
     if not settings.ADMIN_API_KEY or api_key != settings.ADMIN_API_KEY:
@@ -84,41 +95,16 @@ async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
         )
     return api_key
 
-
-
-
-
-
-# async def google_auth_callback(code: str):
-#     flow = Flow.from_client_config({
-#         "web": {
-#             "client_id": settings.GOOGLE_OAUTH_CLIENT_ID,
-#             "client_secret": settings.GOOGLE_OAUTH_CLIENT_SECRET,
-#             "auth_uri": "https://accounts.google.com/o/oauth2/auth",
-#             "token_uri": "https://oauth2.googleapis.com/token",
-#             "redirect_uris": [settings.GOOGLE_OAUTH_REDIRECT_URI]
-#         }
-#     }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
-
-#     flow.redirect_uri = settings.GOOGLE_OAUTH_REDIRECT_URI
-
-#     # Add access type and prompt parameters for refresh token
-#     flow.fetch_token(
-#         code=code,
-#         access_type='offline',
-#         prompt='consent'
-#     )
-#     credentials = flow.credentials
-
-#     return {
-#         "message": "Authentication successful",
-#         "credentials": credentials.to_json()
-#     }
 
 
 @app.get("/documents")
@@ -126,7 +112,7 @@ async def get_all_documents():
     """Get all documents from MongoDB"""
     try:
         documents = await mongodb.get_all_documents()
-
         formatted_documents = []
         for doc in documents:
             try:
@@ -140,9 +126,10 @@ async def get_all_documents():
                 }
                 formatted_documents.append(formatted_doc)
             except Exception as e:
-                logger.error(...
                 continue
-
         return {
             "total_documents": len(formatted_documents),
             "documents": formatted_documents
@@ -151,6 +138,7 @@ async def get_all_documents():
         logger.error(f"Error retrieving documents: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/documents/{document_id}/download")
 async def get_document_file(document_id: str):
     """Serve a document file by its ID"""
@@ -159,27 +147,28 @@ async def get_document_file(document_id: str):
         doc = await mongodb.get_document(document_id)
         if not doc:
             raise HTTPException(status_code=404, detail="Document not found")
-
         # Extract filename from url_path
         filename = doc["url_path"].split("/")[-1]
         file_path = UPLOADS_DIR / filename
-
         if not file_path.exists():
             raise HTTPException(
-                status_code=404,
                 detail=f"File not found on server: {filename}"
             )
-
         return FileResponse(
             path=str(file_path),
             filename=doc["filename"],
             media_type=doc["content_type"]
         )
-
     except Exception as e:
         logger.error(f"Error serving document file: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/documents/upload", response_model=BatchUploadResponse)
 async def upload_documents(
     files: List[UploadFile] = File(...),
@@ -189,14 +178,84 @@ async def upload_documents(
     try:
         vector_store, _ = await get_vector_store()
         response = await document_service.process_documents(
-            files,
-            vector_store,
             background_tasks
         )
         return response
     except Exception as e:
         logger.error(f"Error in document upload: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
 
 
 @app.get("/documentChunks/{document_id}")
@@ -205,10 +264,10 @@ async def get_document_chunks(document_id: str):
     try:
         vector_store, _ = await get_vector_store()
         chunks = vector_store.get_document_chunks(document_id)
-
         if not chunks:
             raise HTTPException(status_code=404, detail="Document not found")
-
         return {
             "document_id": document_id,
             "total_chunks": len(chunks),
@@ -218,53 +277,57 @@ async def get_document_chunks(document_id: str):
         logger.error(f"Error retrieving document chunks: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.delete("/documents/{document_id}")
 async def delete_document(document_id: str):
     """Delete document from MongoDB, ChromaDB, and physical storage"""
     try:
         # First get document details from MongoDB to get file path
         document = await mongodb.get_document(document_id)
-        if not document:
-            raise HTTPException(status_code=404, detail="Document not found")
-
         # Get vector store instance
         vector_store, _ = await get_vector_store()
-
         # Delete physical file using document service
         deletion_success = await document_service.delete_document(document_id)
         if not deletion_success:
-            logger.warning(...
 
         # Delete from vector store
         try:
             vector_store.delete_document(document_id)
         except Exception as e:
-            logger.error(...
             raise HTTPException(
-                status_code=500,
                 detail=f"Failed to delete document from vector store: {str(e)}"
             )
-
         # Delete from MongoDB - don't check return value since document might already be deleted
         await mongodb.delete_document(document_id)
-
         return {
             "status": "success",
             "message": f"Document {document_id} successfully deleted from all stores"
         }
-
     except HTTPException:
         raise
     except Exception as e:
         logger.error(f"Error in delete_document endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/processDriveDocuments")
 async def process_drive_documents():
     try:
         # Initialize vector store
         vector_store, _ = await get_vector_store()
-
         # Initialize Drive document processor
         drive_processor = DriveDocumentProcessor(
             google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
@@ -272,18 +335,19 @@ async def process_drive_documents():
             temp_dir=settings.TEMP_DOWNLOAD_DIR,
             doc_processor=doc_processor
         )
-
         # Process documents
         result = await drive_processor.process_documents(vector_store)
         return result
-
     except Exception as e:
         logger.error(f"Error in process_drive_documents: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail=str(e)
         )
-
 @app.post("/user/contact", response_model=ChatResponse)
 async def create_user_contact(
     request: UserContactRequest,
@@ -296,7 +360,7 @@ async def create_user_contact(
             email=request.email,
             phone_number=request.phone_number
         )
-
         if existing_conversation_id:
             chat_request = ChatRequest(
                 query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. This is Introduction Create a welcome back message for him and ask how i can help you today?',
@@ -315,7 +379,7 @@ async def create_user_contact(
             email=request.email,
             phone_number=request.phone_number
         )
-
         chat_request = ChatRequest(
             query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. This is Introduction Create a welcome message for him and ask how i can help you today?',
             llm_provider="openai",
@@ -324,14 +388,15 @@ async def create_user_contact(
             stream=False,
             conversation_id=new_conversation_id
         )
-
         # Call chat_endpoint with the prepared request
         return await chat_endpoint(chat_request, background_tasks)
-
     except Exception as e:
         logger.error(f"Error in create_user_contact: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
-
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(
     request: ChatRequest,
@@ -340,33 +405,61 @@ async def chat_endpoint(
     """Chat endpoint with RAG support and enhanced Excel handling"""
     try:
         # Initialize core components
-        logger.info(...
         vector_store, embedding_model = await get_vector_store()
-
         logger.info(f"Initializing LLM: {str(datetime.now())}")
         llm = get_llm_instance(request.llm_provider)
-
-        # ...
-        rag_agent = ...(
             llm=llm,
-            ...
             vector_store=vector_store,
             mongodb=mongodb
         )
-
         # Use provided conversation ID or create new one
         conversation_id = request.conversation_id or str(uuid.uuid4())
-
         # Process the query
         query = request.query
-
         # Add specific instructions for certain types of queries
-        #if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
-            #query += ". The response should be short and to the point. Make sure to not add any irrelevant information. make sure to share the response from Vector store, if you do not find information in vector store. Just respond I do not have information. Keep the introduction concise and friendly."
-
         # Generate response
         logger.info(f"Generating response: {str(datetime.now())}")
-
         max_retries = 3
         retry_count = 0
         response = None
@@ -378,7 +471,8 @@ async def chat_endpoint(
                     query=query,
                     conversation_id=conversation_id,
                     temperature=request.temperature,
-                    max_tokens=request.max_tokens if hasattr(...
                 )
                 break
             except Exception as e:
@@ -388,7 +482,8 @@ async def chat_endpoint(
                 await asyncio.sleep(1)  # Brief pause before retry
 
         if response is None:
-            raise last_error or Exception(...
 
         logger.info(f"Response generated: {str(datetime.now())}")
@@ -401,13 +496,13 @@ async def chat_endpoint(
 
         # Add Excel-specific metadata if present
         has_excel_content = any(
-            doc and 'Sheet:' in doc
            for doc in (response.context_docs or [])
        )
        if has_excel_content:
            try:
                metadata['excel_content'] = True
-
                # Extract Excel-specific insights if available
                if hasattr(rag_agent, 'get_excel_insights'):
                    excel_insights = rag_agent.get_excel_insights(
@@ -436,13 +531,14 @@ async def chat_endpoint(
            sources=response.sources,
            conversation_id=conversation_id,
            timestamp=datetime.now(),
-            relevant_doc_scores=response.scores if hasattr(...
            metadata=metadata
        )
 
        # Log completion
        logger.info(f"Chat response completed: {str(datetime.now())}")
-
        return chat_response
 
    except Exception as e:
@@ -451,43 +547,48 @@ async def chat_endpoint(
        if isinstance(e, ValueError):
            raise HTTPException(status_code=400, detail=str(e))
        elif isinstance(e, (KeyError, AttributeError)):
-            raise HTTPException(...
        else:
            raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/chat/history/{conversation_id}")
 async def get_conversation_history(conversation_id: str):
     """Get complete conversation history"""
     history = await mongodb.get_conversation_history(conversation_id)
-
     if not history:
         raise HTTPException(status_code=404, detail="Conversation not found")
-
     return {
         "conversation_id": conversation_id,
         "messages": history
     }
 
 @app.post("/chat/summarize", response_model=SummaryResponse)
 async def summarize_conversation(request: SummarizeRequest):
     """Generate a summary of a conversation"""
     try:
         messages = await mongodb.get_messages_for_summary(request.conversation_id)
-
         if not messages:
-            raise HTTPException(
-                ...
         summary = await summarizer.summarize_conversation(
             messages,
             include_metadata=request.include_metadata
         )
-
         return SummaryResponse(**summary)
-
     except Exception as e:
         logger.error(f"Error generating summary: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/chat/feedback/{conversation_id}")
 async def submit_feedback(
     conversation_id: str,
@@ -498,21 +599,22 @@ async def submit_feedback(
         # Validate conversation exists
         conversation = await mongodb.get_conversation_metadata(conversation_id)
         if not conversation:
-            raise HTTPException(
-                ...
         # Update feedback
         success = await mongodb.update_feedback(
             conversation_id=conversation_id,
             feedback=feedback_request.feedback,
             rating=feedback_request.rating
         )
-
         if not success:
             raise HTTPException(
                 status_code=500,
                 detail="Failed to update feedback"
             )
-
         return {
             "status": "success",
             "message": "Feedback submitted successfully",
@@ -522,20 +624,21 @@ async def submit_feedback(
                 "rating": feedback_request.format_rating()
             }
         }
-
     except HTTPException:
         raise
     except Exception as e:
         logger.error(f"Error submitting feedback: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/debug/config")
 async def debug_config():
     """Debug endpoint to check configuration"""
     import os
     from config.config import settings
     from pathlib import Path
-
     debug_info = {
         "environment_variables": {
             "OPENAI_API_KEY": "[SET]" if os.getenv('OPENAI_API_KEY') else "[NOT SET]",
@@ -550,16 +653,17 @@ async def debug_config():
             "openai_config_exists": (Path.home() / '.openai' / 'api_key').exists()
         }
     }
-
     if settings.OPENAI_API_KEY:
         key = settings.OPENAI_API_KEY
         debug_info["api_key_info"] = {
             "length": len(key),
             "preview": f"{key[:4]}...{key[-4:]}" if len(key) > 8 else "[INVALID LENGTH]"
         }
-
     return debug_info
 
 @app.post("/admin/cleanup")
 async def cleanup_databases(
     include_files: bool = True,
@@ -567,20 +671,36 @@ async def cleanup_databases(
 ):
     """
     Clean up all data from ChromaDB and MongoDB
-
     Args:
         include_files (bool): Whether to also delete uploaded files
     """
     try:
         result = await perform_cleanup(mongodb, include_files)
         return result
     except Exception as e:
         logger.error(f"Error in cleanup operation: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail=f"Error during cleanup: {str(e)}"
         )
-
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
@@ -588,4 +708,4 @@ async def health_check():
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 # src/main.py
+from config.config import settings
+from src.utils.database_cleanup import perform_cleanup
+from fastapi.security import APIKeyHeader
+from fastapi import HTTPException, Depends
+from fastapi.responses import JSONResponse
+from src.models import (
+    ChatRequest,
+    ChatResponse,
+    BatchUploadResponse,
+    SummarizeRequest,
+    SummaryResponse,
+    FeedbackRequest
+)
+from src.implementations.document_service import DocumentService
+from src.db.mongodb_store import MongoDBStore
+from src.utils.llm_utils import get_llm_instance, get_vector_store
+from src.utils.logger import logger
+from src.utils.conversation_summarizer import ConversationSummarizer
+from src.utils.drive_document_processor import DriveDocumentProcessor
+from src.utils.document_processor import DocumentProcessor
+from src.models.UserContact import UserContactRequest
+from src.models.document import AllDocumentsResponse, StoredDocument
+from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
+from src.utils.google_drive_service import GoogleDriveService
+from google_auth_oauthlib.flow import Flow
+from google.oauth2.credentials import Credentials
+from fastapi.responses import RedirectResponse
 from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
 from fastapi.responses import StreamingResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
 ...
 from pathlib import Path
 import os
 import asyncio
 
+import chromadb
+from pathlib import Path
+import asyncio
+import gc
+import random
+from typing import List
+from src.utils.logger import logger
+from config.config import settings
+
+os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
+# os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
 
 # Import custom modules1
+# from src.agents.rag_agent import RAGAgent
 
 app = FastAPI(title="Chatbot API")
 
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["http://localhost:8080",
+                   "http://localhost:3000"],  # Add both ports
     allow_credentials=True,
     allow_methods=["*"],  # Allows all methods
     allow_headers=["*"],  # Allows all headers
 )
 
+# google_drive_service = GoogleDriveService()
 
 # Initialize MongoDB
 mongodb = MongoDBStore(settings.MONGODB_URI)
 ...
 # Security setup
 API_KEY_HEADER = APIKeyHeader(name="ADMIN_API_KEY")
 
+
 async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
     """Verify admin API key"""
     if not settings.ADMIN_API_KEY or api_key != settings.ADMIN_API_KEY:
 ...
     )
     return api_key
 
+
+def get_chroma_client():
+    """Get a new ChromaDB client instance"""
+    return chromadb.PersistentClient(
+        path=settings.CHROMA_PATH,
+        settings=chromadb.Settings(
+            allow_reset=True,
+            is_persistent=True
+        )
+    )
 
 
 @app.get("/documents")
 ...
     """Get all documents from MongoDB"""
     try:
         documents = await mongodb.get_all_documents()
+
         formatted_documents = []
         for doc in documents:
             try:
 ...
                 }
                 formatted_documents.append(formatted_doc)
             except Exception as e:
+                logger.error(
+                    f"Error formatting document {doc.get('document_id', 'unknown')}: {str(e)}")
                 continue
+
         return {
             "total_documents": len(formatted_documents),
             "documents": formatted_documents
 ...
         logger.error(f"Error retrieving documents: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.get("/documents/{document_id}/download")
 async def get_document_file(document_id: str):
     """Serve a document file by its ID"""
 ...
         doc = await mongodb.get_document(document_id)
         if not doc:
             raise HTTPException(status_code=404, detail="Document not found")
+
         # Extract filename from url_path
         filename = doc["url_path"].split("/")[-1]
         file_path = UPLOADS_DIR / filename
+
         if not file_path.exists():
             raise HTTPException(
+                status_code=404,
                 detail=f"File not found on server: {filename}"
             )
+
         return FileResponse(
             path=str(file_path),
             filename=doc["filename"],
             media_type=doc["content_type"]
         )
+
     except Exception as e:
         logger.error(f"Error serving document file: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.post("/documents/upload", response_model=BatchUploadResponse)
 async def upload_documents(
     files: List[UploadFile] = File(...),
 ...
     try:
         vector_store, _ = await get_vector_store()
         response = await document_service.process_documents(
+            files,
+            vector_store,
             background_tasks
         )
         return response
     except Exception as e:
         logger.error(f"Error in document upload: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/documentChunks")
+async def get_all_document_chunks():
+    """Get all document chunks from the vector store"""
+    try:
+        # Get vector store instance
+        vector_store, _ = await get_vector_store()
+
+        # Retrieve all documents
+        all_documents = vector_store.get_all_documents()
+
+        # If no documents, return a structured response instead of raising an exception
+        if not all_documents:
+            return {
+                "total_documents": 0,
+                "documents": [],
+                "message": "No documents are currently stored in the vector store. Upload some documents to see chunks."
+            }
+
+        # Group chunks by document_id
+        document_chunks = {}
+        for doc in all_documents:
+            # Safely extract document_id
+            document_id = doc.get('metadata', {}).get('document_id',
+                                                      doc.get('id',
+                                                              str(uuid.uuid4())))
+
+            # Ensure metadata is a dictionary
+            metadata = doc.get('metadata', {}) if isinstance(
+                doc.get('metadata'), dict) else {}
+
+            # Create chunk entry
+            chunk = {
+                'text': str(doc.get('text', '')),
+                'metadata': metadata
+            }
+
+            # Group chunks by document_id
+            if document_id not in document_chunks:
+                document_chunks[document_id] = []
+
+            document_chunks[document_id].append(chunk)
+
+        # Prepare response
+        processed_documents = []
+        for doc_id, chunks in document_chunks.items():
+            processed_documents.append({
+                "document_id": doc_id,
+                "total_chunks": len(chunks),
+                "chunks": chunks
+            })
+
+        return {
+            "total_documents": len(processed_documents),
+            "documents": processed_documents,
+            "message": f"Successfully retrieved {len(processed_documents)} documents"
+        }
+
+    except Exception as e:
+        # Log the full error for debugging
+        logger.error(
+            f"Error retrieving all document chunks: {str(e)}", exc_info=True)
+
+        # Return a structured error response
+        return {
+            "total_documents": 0,
+            "documents": [],
+            "message": f"An error occurred while retrieving document chunks: {str(e)}"
+        }
 
 
 @app.get("/documentChunks/{document_id}")
 ...
     try:
         vector_store, _ = await get_vector_store()
         chunks = vector_store.get_document_chunks(document_id)
+
         if not chunks:
             raise HTTPException(status_code=404, detail="Document not found")
+
         return {
             "document_id": document_id,
             "total_chunks": len(chunks),
 ...
         logger.error(f"Error retrieving document chunks: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.delete("/documents/{document_id}")
 async def delete_document(document_id: str):
     """Delete document from MongoDB, ChromaDB, and physical storage"""
     try:
         # First get document details from MongoDB to get file path
         document = await mongodb.get_document(document_id)
+        # if not document:
+        #     raise HTTPException(status_code=404, detail="Document not found")
+
         # Get vector store instance
         vector_store, _ = await get_vector_store()
+
         # Delete physical file using document service
         deletion_success = await document_service.delete_document(document_id)
         if not deletion_success:
+            logger.warning(
+                f"Failed to delete physical file for document {document_id}")
+
         # Delete from vector store
         try:
             vector_store.delete_document(document_id)
         except Exception as e:
+            logger.error(
+                f"Error deleting document from vector store: {str(e)}")
             raise HTTPException(
+                status_code=500,
                 detail=f"Failed to delete document from vector store: {str(e)}"
             )
+
         # Delete from MongoDB - don't check return value since document might already be deleted
         await mongodb.delete_document(document_id)
+
         return {
             "status": "success",
             "message": f"Document {document_id} successfully deleted from all stores"
         }
+
     except HTTPException:
         raise
     except Exception as e:
         logger.error(f"Error in delete_document endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.post("/processDriveDocuments")
 async def process_drive_documents():
     try:
         # Initialize vector store
         vector_store, _ = await get_vector_store()
+
         # Initialize Drive document processor
         drive_processor = DriveDocumentProcessor(
             google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
 ...
             temp_dir=settings.TEMP_DOWNLOAD_DIR,
             doc_processor=doc_processor
         )
+
         # Process documents
         result = await drive_processor.process_documents(vector_store)
         return result
+
     except Exception as e:
         logger.error(f"Error in process_drive_documents: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail=str(e)
         )
+
+
 @app.post("/user/contact", response_model=ChatResponse)
 async def create_user_contact(
     request: UserContactRequest,
 ...
             email=request.email,
             phone_number=request.phone_number
         )
+
         if existing_conversation_id:
             chat_request = ChatRequest(
                 query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. This is Introduction Create a welcome back message for him and ask how i can help you today?',
 ...
             email=request.email,
             phone_number=request.phone_number
         )
+
         chat_request = ChatRequest(
             query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. This is Introduction Create a welcome message for him and ask how i can help you today?',
             llm_provider="openai",
 ...
             stream=False,
             conversation_id=new_conversation_id
         )
+
         # Call chat_endpoint with the prepared request
         return await chat_endpoint(chat_request, background_tasks)
+
     except Exception as e:
         logger.error(f"Error in create_user_contact: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
+
+
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(
     request: ChatRequest,
 ...
     """Chat endpoint with RAG support and enhanced Excel handling"""
     try:
         # Initialize core components
+        logger.info(
+            f"Initializing vector store and embedding: {str(datetime.now())}")
         vector_store, embedding_model = await get_vector_store()
+
         logger.info(f"Initializing LLM: {str(datetime.now())}")
         llm = get_llm_instance(request.llm_provider)
+
+        # Initialize RAG agent
+        # rag_agent = RAGAgent(
+        #     llm=llm,
+        #     embedding=embedding_model,
+        #     vector_store=vector_store,
+        #     mongodb=mongodb
+        # )
+
+        rag_agent = SystemInstructionsRAGAgent(
             llm=llm,
+            embedding=embedding_model,
             vector_store=vector_store,
             mongodb=mongodb
         )
+
+        # rag_agent.add_custom_role(
+        #     "Knowledge based chatbot and introduction specialist",
+        #     """You are a welcome agent with knowledge based specialist focusing on knowledge attached and create a beautiful welcome message.
+        #     Your role is to:
+        #     1. Your response should be short and to the point.
+        #     2. Strictly follow this point for If it is an introduction. You strictly respond that "Welcome name of customer to our platform. How can I help you today?"
+        #     """
+        # )
+
+        # rag_agent.add_custom_role(
+        #     "Knowledge based chatbot",
+        #     """You are a knowledge based specialist focusing on knowledge attached.
+        #     Your role is to:
+        #     1. Your response should be short and to the point.
+        #     2. if it is not introduction then make sure to share the response from Vector store.
+        #     3. If you do not find relevant information. Just say I do not have this information but this do not apply to introduction message.
+        #     4. If there is an introduction, you should ignore above roles and connect with LLm to have a welcome message for the user.
+        #     """
+        # )
+
         # Use provided conversation ID or create new one
         conversation_id = request.conversation_id or str(uuid.uuid4())
+
         # Process the query
         query = request.query
+
         # Add specific instructions for certain types of queries
+        # if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
+        #     query += ". The response should be short and to the point. Make sure to not add any irrelevant information. make sure to share the response from Vector store, if you do not find information in vector store. Just respond I do not have information. Keep the introduction concise and friendly."
+
         # Generate response
         logger.info(f"Generating response: {str(datetime.now())}")
+
         max_retries = 3
         retry_count = 0
         response = None
 ...
                     query=query,
                     conversation_id=conversation_id,
                     temperature=request.temperature,
+                    max_tokens=request.max_tokens if hasattr(
+                        request, 'max_tokens') else None
                 )
                 break
             except Exception as e:
 ...
                 await asyncio.sleep(1)  # Brief pause before retry
 
         if response is None:
+            raise last_error or Exception(
+                "Failed to generate response after retries")
 
         logger.info(f"Response generated: {str(datetime.now())}")
 ...
 
         # Add Excel-specific metadata if present
         has_excel_content = any(
+            doc and 'Sheet:' in doc
             for doc in (response.context_docs or [])
         )
         if has_excel_content:
             try:
                 metadata['excel_content'] = True
+
                 # Extract Excel-specific insights if available
                 if hasattr(rag_agent, 'get_excel_insights'):
                     excel_insights = rag_agent.get_excel_insights(
 ...
             sources=response.sources,
             conversation_id=conversation_id,
             timestamp=datetime.now(),
+            relevant_doc_scores=response.scores if hasattr(
+                response, 'scores') else None,
             metadata=metadata
         )
 
         # Log completion
         logger.info(f"Chat response completed: {str(datetime.now())}")
+
         return chat_response
 
     except Exception as e:
 ...
         if isinstance(e, ValueError):
             raise HTTPException(status_code=400, detail=str(e))
         elif isinstance(e, (KeyError, AttributeError)):
+            raise HTTPException(
+                status_code=500, detail="Internal processing error")
         else:
             raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.get("/chat/history/{conversation_id}")
 async def get_conversation_history(conversation_id: str):
     """Get complete conversation history"""
     history = await mongodb.get_conversation_history(conversation_id)
+
     if not history:
         raise HTTPException(status_code=404, detail="Conversation not found")
+
     return {
         "conversation_id": conversation_id,
         "messages": history
     }
 
+
 @app.post("/chat/summarize", response_model=SummaryResponse)
 async def summarize_conversation(request: SummarizeRequest):
     """Generate a summary of a conversation"""
     try:
         messages = await mongodb.get_messages_for_summary(request.conversation_id)
+
         if not messages:
+            raise HTTPException(
+                status_code=404, detail="Conversation not found")
+
         summary = await summarizer.summarize_conversation(
             messages,
             include_metadata=request.include_metadata
         )
+
         return SummaryResponse(**summary)
+
     except Exception as e:
         logger.error(f"Error generating summary: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.post("/chat/feedback/{conversation_id}")
 async def submit_feedback(
     conversation_id: str,
 ...
         # Validate conversation exists
         conversation = await mongodb.get_conversation_metadata(conversation_id)
         if not conversation:
+            raise HTTPException(
+                status_code=404, detail="Conversation not found")
+
         # Update feedback
         success = await mongodb.update_feedback(
             conversation_id=conversation_id,
             feedback=feedback_request.feedback,
             rating=feedback_request.rating
         )
+
         if not success:
             raise HTTPException(
                 status_code=500,
                 detail="Failed to update feedback"
             )
+
         return {
             "status": "success",
             "message": "Feedback submitted successfully",
 ...
                 "rating": feedback_request.format_rating()
             }
         }
+
     except HTTPException:
         raise
     except Exception as e:
         logger.error(f"Error submitting feedback: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
+
 @app.get("/debug/config")
 async def debug_config():
     """Debug endpoint to check configuration"""
     import os
     from config.config import settings
     from pathlib import Path
+
     debug_info = {
         "environment_variables": {
             "OPENAI_API_KEY": "[SET]" if os.getenv('OPENAI_API_KEY') else "[NOT SET]",
 ...
             "openai_config_exists": (Path.home() / '.openai' / 'api_key').exists()
         }
     }
+
     if settings.OPENAI_API_KEY:
         key = settings.OPENAI_API_KEY
         debug_info["api_key_info"] = {
             "length": len(key),
             "preview": f"{key[:4]}...{key[-4:]}" if len(key) > 8 else "[INVALID LENGTH]"
         }
+
     return debug_info
 
+
 @app.post("/admin/cleanup")
 async def cleanup_databases(
     include_files: bool = True,
 ...
 ):
     """
     Clean up all data from ChromaDB and MongoDB
+
     Args:
         include_files (bool): Whether to also delete uploaded files
+
+    Returns:
+        Dict: Cleanup operation summary with restart information
     """
     try:
         result = await perform_cleanup(mongodb, include_files)
+
+        # If restart is needed, return 202 Accepted instead of 200 OK
+        if result.get("restart_needed"):
+            return JSONResponse(
+                status_code=202,
+                content={
+                    **result,
+                    "message": "Cleanup partially completed. Please restart the server to complete ChromaDB cleanup."
+                }
+            )
+
         return result
+
     except Exception as e:
         logger.error(f"Error in cleanup operation: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail=f"Error during cleanup: {str(e)}"
         )
+
+
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
 ...
 
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
src/utils/__pycache__/database_cleanup.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/database_cleanup.cpython-312.pyc and b/src/utils/__pycache__/database_cleanup.cpython-312.pyc differ

src/utils/__pycache__/document_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
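The /chat endpoint above retries generate_response up to three times before giving up. The same pattern in isolation, for any awaitable operation (a sketch, not the endpoint's exact code):

import asyncio

async def with_retries(operation, max_retries=3):
    """Retry an async callable, pausing briefly between attempts,
    mirroring the retry loop in chat_endpoint."""
    last_error = None
    for _ in range(max_retries):
        try:
            return await operation()
        except Exception as e:
            last_error = e
            await asyncio.sleep(1)  # brief pause before retry
    raise last_error or Exception("Failed to generate response after retries")

Keeping the retry count small and the pause fixed matches the endpoint's design choice: transient LLM or vector-store hiccups are retried, while persistent failures surface quickly as a 500.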
src/utils/database_cleanup.py
CHANGED
@@ -1,131 +1,180 @@
 # src/utils/database_cleanup.py
-from typing import List, Dict
 import chromadb
 import shutil
 from pathlib import Path
+import asyncio
+import gc
+import random
+from typing import List, Dict, Tuple
 from src.utils.logger import logger
 from config.config import settings
 
 
+async def cleanup_chroma() -> Tuple[List[str], bool]:
+    """Clean up ChromaDB data while maintaining connection"""
+    details = []
+    restart_needed = False
+
     try:
+        # Get existing client
         client = chromadb.PersistentClient(
             path=settings.CHROMA_PATH,
             settings=chromadb.Settings(
                 allow_reset=True,
-                is_persistent=True
+                is_persistent=True,
+                anonymized_telemetry=False
             )
         )
+
+        # Get all collections
+        collections = client.list_collections()
+
+        if not collections:
+            details.append("No collections found in ChromaDB")
+            return details, restart_needed
+
+        # Delete data from each collection
+        for collection in collections:
+            try:
+                # Get all IDs in the collection
+                all_ids = collection.get()['ids']
+
+                if all_ids:
+                    # Delete all documents in the collection
+                    collection.delete(ids=all_ids)
+                    details.append(
+                        f"Deleted {len(all_ids)} documents from collection {collection.name}")
+                else:
+                    details.append(
+                        f"Collection {collection.name} was already empty")
+
+                # Delete the collection itself
+                client.delete_collection(collection.name)
+                details.append(f"Deleted collection {collection.name}")
+
+            except Exception as e:
+                logger.warning(
+                    f"Error cleaning collection {collection.name}: {str(e)}")
+                details.append(
+                    f"Error cleaning collection {collection.name}: {str(e)}")
+                restart_needed = True  # Set restart flag if any collection fails
+
+        # Optional: check whether a full reset might still be necessary
+        if len(client.list_collections()) > 0:
+            restart_needed = True
+            details.append("Some collections might require manual reset")
+
+        return details, restart_needed
+
     except Exception as e:
         raise Exception(f"ChromaDB cleanup failed: {str(e)}")
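`cleanup_chroma()` empties each collection by ID before dropping it, which keeps the persistent client usable even when a drop fails. The same wipe pattern in isolation (the `./chroma_db` path is an assumption; the service reads it from `settings.CHROMA_PATH`):

```python
import chromadb

client = chromadb.PersistentClient(path="./chroma_db")  # assumed path

for collection in client.list_collections():
    ids = collection.get()["ids"]
    if ids:
        # Remove every stored document before dropping the collection itself
        collection.delete(ids=ids)
    client.delete_collection(collection.name)
```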
+
+
 async def cleanup_mongodb(mongodb) -> List[str]:
-    """
-    Clean up MongoDB collections
-
-    Args:
-        mongodb: MongoDB store instance
-
-    Returns:
-        List[str]: Details of cleanup operations
-    """
+    """Clean up MongoDB collections"""
     details = []
+
     try:
+        # Get all collections in the database
+        collections = await mongodb.db.list_collection_names()
+
+        # Core collections from MongoDBStore initialization
+        core_collections = {
+            'chat_history': mongodb.chat_history,
+            'conversations': mongodb.conversations,
+            'knowledge_base': mongodb.documents,  # documents maps to knowledge_base
+            # Direct access to the vector_metadata collection
+            'vector_metadata': mongodb.db.vector_metadata,
+        }
+
+        # Clean each core collection
+        for name, collection in core_collections.items():
+            try:
+                result = await collection.delete_many({})
+                details.append(
+                    f"Cleared {name} ({result.deleted_count} documents)")
+            except Exception as e:
+                logger.error(f"Error clearing {name}: {str(e)}")
+                details.append(f"Error clearing {name}: {str(e)}")
+
+        # Clean any additional collections not in the core set
+        for coll_name in collections:
+            if coll_name not in core_collections:
+                try:
+                    result = await mongodb.db[coll_name].delete_many({})
+                    details.append(
+                        f"Cleared additional collection {coll_name} ({result.deleted_count} documents)")
+                except Exception as e:
+                    logger.error(
+                        f"Error clearing additional collection {coll_name}: {str(e)}")
+
         return details
     except Exception as e:
         raise Exception(f"MongoDB cleanup failed: {str(e)}")
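The MongoDB cleanup relies on Motor's async `delete_many({})`, which removes every document while leaving the collection and its indexes in place. A standalone sketch, with the connection URI and database name as placeholder assumptions:

```python
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient

async def wipe_database() -> None:
    client = AsyncIOMotorClient("mongodb://localhost:27017")  # assumed URI
    db = client["ragdb"]  # assumed database name
    for name in await db.list_collection_names():
        result = await db[name].delete_many({})
        print(f"Cleared {name} ({result.deleted_count} documents)")

asyncio.run(wipe_database())
```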
 
+
 async def cleanup_files() -> List[str]:
-    """
-    Clean up uploaded files
-
-    Returns:
-        List[str]: Details of cleanup operations
-    """
+    """Clean up uploaded files and temporary directories"""
     details = []
+
+    # Directories to clean
+    directories = {
+        'uploads': Path("uploads"),
+        'temp_downloads': Path(settings.TEMP_DOWNLOAD_DIR),
+        # Additional temp directory used by some components
+        'temp_dir': Path('./temp')
+    }
+
+    for dir_name, dir_path in directories.items():
+        if dir_path.exists():
+            try:
+                # Delete all files in the directory
+                for file in dir_path.glob('*'):
+                    try:
+                        if file.is_file():
+                            file.unlink()
+                            details.append(
+                                f"Deleted file: {file.name} from {dir_name}")
+                    except Exception as e:
+                        details.append(
+                            f"Error deleting file {file.name} from {dir_name}: {str(e)}")
+
+                # Try to remove the empty directory
+                if not any(dir_path.iterdir()):
+                    dir_path.rmdir()
+                    details.append(f"Removed empty {dir_name} directory")
+            except Exception as e:
+                details.append(
+                    f"Error cleaning {dir_name} directory: {str(e)}")
+        else:
+            details.append(f"No {dir_name} directory found")
+
     return details
 
+
-async def perform_cleanup(
-    mongodb,
-    include_files: bool = True
-) -> Dict:
+async def perform_cleanup(mongodb, include_files: bool = True) -> Dict:
     """
-    Perform comprehensive cleanup of all databases
+    Perform comprehensive cleanup of all databases and files
+
     Args:
         mongodb: MongoDB store instance
         include_files (bool): Whether to also delete uploaded files
+
     Returns:
-        Dict: Cleanup operation summary
+        Dict: Cleanup operation summary with detailed status
     """
     cleanup_summary = {
         "chroma_db": {"status": "not_started", "details": []},
         "mongodb": {"status": "not_started", "details": []},
         "files": {"status": "not_started", "details": []}
     }
+
     try:
         # Clean ChromaDB
         try:
-            details = await cleanup_chroma()
+            details, restart_needed = await cleanup_chroma()
             cleanup_summary["chroma_db"] = {
-                "status": "success",
+                "status": "success" if not restart_needed else "partial",
                 "details": details
             }
         except Exception as e:
@@ -166,17 +215,21 @@ async def perform_cleanup(
 
         # Determine overall status
         overall_status = "success"
+        if restart_needed:
+            overall_status = "partial_success"
+            cleanup_summary["message"] = "Cleanup partially completed. Server restart required to complete ChromaDB cleanup."
+        # Check the all-error case before the any-error case so a total
+        # failure is not misreported as partial success.
+        elif all(item["status"] == "error" for item in cleanup_summary.values()):
             overall_status = "error"
+        elif any(item["status"] == "error" for item in cleanup_summary.values()):
             overall_status = "partial_success"
 
         return {
             "status": overall_status,
-            "message": "Cleanup operation completed",
-            "details": cleanup_summary
+            "message": cleanup_summary.get("message", "Cleanup operation completed"),
+            "details": cleanup_summary,
+            "restart_needed": restart_needed
         }
 
     except Exception as e:
         logger.error(f"Error in cleanup operation: {str(e)}")
+        raise
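`perform_cleanup()` rolls the three per-store results into one summary dict. A consumption sketch, assuming an already-initialized MongoDB store instance:

```python
import asyncio

from src.utils.database_cleanup import perform_cleanup

async def run_cleanup(mongodb) -> None:
    # mongodb: an initialized MongoDBStore instance (app-specific)
    summary = await perform_cleanup(mongodb, include_files=True)
    print(summary["status"])  # "success", "partial_success", or "error"
    if summary["restart_needed"]:
        # Mirror the API behaviour: surface the restart requirement
        print(summary["message"])

# asyncio.run(run_cleanup(mongodb))  # requires a configured store
```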
src/utils/document_processor.py
CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
 import json
 from pathlib import Path
 import hashlib
-import magic
+import magic
 from bs4 import BeautifulSoup
 import csv
 from datetime import datetime
@@ -16,41 +16,92 @@ import tiktoken
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import logging
 from bs4.element import ProcessingInstruction
+
+from config.config import Settings
 from .enhanced_excel_processor import EnhancedExcelProcessor
 
+
 class DocumentProcessor:
     def __init__(
         self,
+        chunk_size: Optional[int] = None,
+        chunk_overlap: Optional[int] = None,
+        max_file_size: Optional[int] = None,
         supported_formats: Optional[List[str]] = None
     ):
+        """
+        Initialize DocumentProcessor with configurable parameters
+
+        Args:
+            chunk_size (Optional[int]): Size of text chunks
+            chunk_overlap (Optional[int]): Overlap between chunks
+            max_file_size (Optional[int]): Maximum file size in bytes
+            supported_formats (Optional[List[str]]): List of supported file extensions
+        """
+
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format='%(asctime)s - %(levelname)s - %(message)s'
+        )
+
+        # Get settings with validation
+        default_settings = Settings.get_document_processor_settings()
+
+        # Use provided values or defaults from settings
+        self.chunk_size = chunk_size if chunk_size is not None else default_settings[
+            'chunk_size']
+        self.chunk_overlap = chunk_overlap if chunk_overlap is not None else default_settings[
+            'chunk_overlap']
+        self.max_file_size = max_file_size if max_file_size is not None else default_settings[
+            'max_file_size']
+        self.supported_formats = supported_formats if supported_formats is not None else default_settings[
+            'supported_formats']
+
+        # Validate settings
+        self._validate_settings()
+
+        # Initialize existing components
         self.processing_queue = Queue()
         self.processed_docs = {}
         self._initialize_text_splitter()
-
-        # Initialize Excel processor
         self.excel_processor = EnhancedExcelProcessor()
+
-        # Check for required packages
+        # Check for required packages (keep existing functionality)
         try:
             import striprtf.striprtf
         except ImportError:
-            logging.warning(
+            logging.warning(
+                "Warning: striprtf package not found. RTF support will be limited.")
+
         try:
             from bs4 import BeautifulSoup
             import lxml
         except ImportError:
-            logging.warning(
+            logging.warning(
+                "Warning: beautifulsoup4 or lxml package not found. XML support will be limited.")
+
+    def _validate_settings(self):
+        """Validate and adjust settings if necessary"""
+        # Ensure chunk_size is positive and reasonable
+        self.chunk_size = max(100, self.chunk_size)
+
+        # Ensure chunk_overlap is less than chunk_size
+        self.chunk_overlap = min(self.chunk_overlap, self.chunk_size - 50)
+
+        # Ensure max_file_size is reasonable (minimum 1MB)
+        self.max_file_size = max(1024 * 1024, self.max_file_size)
+
+        # Ensure supported_formats contains valid extensions
+        if not self.supported_formats:
+            # Fall back to default supported formats if empty
+            self.supported_formats = Settings.DOCUMENT_PROCESSOR['supported_formats']
+
+        # Ensure all formats start with a dot
+        self.supported_formats = [
+            f".{fmt.lower().lstrip('.')}" if not fmt.startswith(
+                '.') else fmt.lower()
+            for fmt in self.supported_formats
+        ]
 
     def _initialize_text_splitter(self):
         """Initialize the text splitter with custom settings"""
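With the new constructor, anything not passed explicitly is resolved from `Settings.get_document_processor_settings()` and then clamped by `_validate_settings()`. A usage sketch with illustrative values (not the project defaults), assuming the project's `Settings` class is importable:

```python
# Explicit values override the configured defaults; anything omitted
# falls back to Settings.get_document_processor_settings().
processor = DocumentProcessor(
    chunk_size=1000,     # clamped to at least 100 by _validate_settings()
    chunk_overlap=200,   # clamped to at most chunk_size - 50
    supported_formats=["pdf", "docx"],  # normalized to ['.pdf', '.docx']
)
assert processor.chunk_overlap <= processor.chunk_size - 50
```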
@@ -58,13 +109,241 @@ class DocumentProcessor:
             chunk_size=self.chunk_size,
             chunk_overlap=self.chunk_overlap,
             length_function=len,
+            # Modify separators to better handle markdown while maintaining overlap
+            separators=["\n\n", "\n", " ", ""],
+            keep_separator=True,
+            add_start_index=True,
+            strip_whitespace=False  # Keep whitespace to maintain markdown formatting
         )
 
+    def split_text(self, text: str) -> List[str]:
+        """Split text with enforced overlap while preserving structure"""
+        try:
+            # Get initial split using RecursiveCharacterTextSplitter
+            initial_chunks = self.text_splitter.split_text(text)
+            if len(initial_chunks) <= 1:
+                return initial_chunks
+
+            # Process chunks with enforced overlap
+            final_chunks = []
+
+            for i, current_chunk in enumerate(initial_chunks):
+                if i == 0:
+                    final_chunks.append(current_chunk)
+                    continue
+
+                prev_chunk = final_chunks[-1]
+
+                # Get the last part of previous chunk for overlap
+                overlap_size = min(self.chunk_overlap, len(prev_chunk))
+                overlap_text = prev_chunk[-overlap_size:]
+
+                # For tables, include the header row
+                if '|' in current_chunk and '\n' in current_chunk:
+                    table_lines = current_chunk.split('\n')
+                    header_lines = []
+                    for line in table_lines:
+                        if line.strip().startswith('|'):
+                            header_lines.append(line)
+                        else:
+                            break
+                    if header_lines:
+                        header_text = '\n'.join(header_lines) + '\n'
+                        overlap_text = header_text + overlap_text
+
+                # Create new chunk with overlap
+                new_chunk = overlap_text + current_chunk
+
+                # Ensure we don't have duplicate content at the overlap point
+                if current_chunk.startswith(overlap_text):
+                    new_chunk = current_chunk
+
+                # Add context from previous chunk when needed
+                if not any(marker in new_chunk for marker in ['**AGENDA**', '**DISCUSSIONS**', '| No |']):
+                    context_markers = ['**AGENDA**',
+                                       '**DISCUSSIONS**', '| No |']
+                    for marker in context_markers:
+                        if marker in prev_chunk and marker not in new_chunk:
+                            new_chunk = marker + "\n" + new_chunk
+                            break
+
+                final_chunks.append(new_chunk)
+
+            # Validate and log overlaps
+            for i in range(len(final_chunks)-1):
+                actual_overlap = self._find_actual_overlap(
+                    final_chunks[i], final_chunks[i+1])
+                logging.debug(
+                    f"Overlap between chunks {i} and {i+1}: {len(actual_overlap)} characters")
+                if len(actual_overlap) < self.chunk_overlap:
+                    logging.warning(
+                        f"Insufficient overlap between chunks {i} and {i+1}")
+
+            return final_chunks
+
+            # NOTE: the block below is unreachable (it follows the return above)
+            # and references names (table_sections, chunks, current_position,
+            # _get_overlap_text, _ensure_minimum_overlap) that are not defined
+            # in this class; it appears to be a leftover from an earlier
+            # table-aware splitting path.
+            for start, end in table_sections:
+                # Process text before table if exists
+                if start > current_position:
+                    non_table_text = text[current_position:start]
+                    if non_table_text.strip():
+                        text_chunks = self.text_splitter.split_text(
+                            non_table_text)
+                        if chunks and text_chunks:
+                            # Ensure overlap with previous chunk
+                            prev_chunk = chunks[-1]
+                            overlap = self._get_overlap_text(prev_chunk)
+                            text_chunks[0] = overlap + text_chunks[0]
+                        chunks.extend(text_chunks)
+
+                # Process table as a single chunk with overlap
+                table_text = text[start:end]
+                if chunks:
+                    prev_chunk = chunks[-1]
+                    overlap = self._get_overlap_text(prev_chunk)
+                    table_text = overlap + table_text
+                chunks.append(table_text)
+                current_position = end
+
+            # Process remaining text after last table
+            if current_position < len(text):
+                remaining_text = text[current_position:]
+                if remaining_text.strip():
+                    text_chunks = self.text_splitter.split_text(remaining_text)
+                    if chunks and text_chunks:
+                        # Ensure overlap with previous chunk
+                        prev_chunk = chunks[-1]
+                        overlap = self._get_overlap_text(prev_chunk)
+                        text_chunks[0] = overlap + text_chunks[0]
+                    chunks.extend(text_chunks)
+
+            # Validate and adjust overlaps
+            chunks = self._ensure_minimum_overlap(chunks)
+
+            # Log chunk details for debugging
+            for i in range(len(chunks)-1):
+                overlap = self._find_actual_overlap(chunks[i], chunks[i+1])
+                logging.debug(
+                    f"Overlap between chunks {i} and {i+1}: {len(overlap)} characters")
+                logging.debug(f"End of chunk {i}: {chunks[i][-50:]}")
+                logging.debug(f"Start of chunk {i+1}: {chunks[i+1][:50]}")
+
+            return chunks
+
+        except Exception as e:
+            logging.error(f"Error in split_text: {str(e)}")
+            # Fall back to the original text splitter
+            return self.text_splitter.split_text(text)
+
+    def _find_break_point(self, text: str, prev_chunk: str) -> int:
+        """
+        Find suitable breaking point that maintains document structure
+
+        Args:
+            text (str): Text to find break point in (the overlap portion)
+            prev_chunk (str): The complete previous chunk for context
+
+        Returns:
+            int: Position of suitable break point
+        """
+        # Get the context of how the previous chunk ends
+        prev_chunk_lines = prev_chunk.split('\n')
+
+        # Special handling for markdown tables
+        if '|' in prev_chunk:
+            # Check if we're in the middle of a table
+            table_rows = [
+                line for line in prev_chunk_lines if line.strip().startswith('|')]
+            if table_rows:
+                # Find where the current table starts in the text
+                table_start = text.find('|')
+                if table_start >= 0:
+                    # Find the next row boundary
+                    next_row = text.find('\n', table_start)
+                    if next_row >= 0:
+                        return next_row + 1  # Include the newline
+
+        # Define break point markers in order of preference
+        break_markers = [
+            ('\n\n', True),  # Paragraph breaks (keep marker)
+            ('\n', True),    # Line breaks (keep marker)
+            ('. ', True),    # Sentence endings (keep marker)
+            (', ', True),    # Clause breaks (keep marker)
+            (' ', False)     # Word breaks (don't keep marker)
+        ]
+
+        # Check the structure of the previous chunk end
+        last_line = prev_chunk_lines[-1] if prev_chunk_lines else ""
+
+        # Look for each type of break point
+        for marker, keep_marker in break_markers:
+            if marker in text:
+                # Try to find a break point that maintains document structure
+                marker_positions = [i for i in range(
+                    len(text)) if text[i:i+len(marker)] == marker]
+
+                for pos in reversed(marker_positions):
+                    # Check if this break point would maintain document structure
+                    if self._is_valid_break_point(text, pos, last_line):
+                        return pos + (len(marker) if keep_marker else 0)
+
+        # If no suitable break point found, default to exact position
+        return min(len(text), self.chunk_overlap)
+
+    def _is_valid_break_point(self, text: str, position: int, last_line: str) -> bool:
+        """
+        Check if a break point would maintain document structure
+
+        Args:
+            text (str): Text being checked
+            position (int): Potential break position
+            last_line (str): Last line of previous chunk
+
+        Returns:
+            bool: True if break point is valid
+        """
+        # Don't break in the middle of markdown formatting
+        markdown_markers = ['*', '_', '`', '[', ']', '(', ')', '#']
+        if position > 0 and position < len(text) - 1:
+            if text[position-1] in markdown_markers or text[position+1] in markdown_markers:
+                return False
+
+        # Don't break in the middle of a table cell
+        if '|' in last_line:
+            cell_count = last_line.count('|')
+            text_before_break = text[:position]
+            if text_before_break.count('|') % cell_count != 0:
+                return False
+
+        # Don't break URLs or code blocks
+        url_patterns = ['http://', 'https://', '```', '`']
+        for pattern in url_patterns:
+            if pattern in text[:position] and pattern not in text[position:]:
+                return False
+
+        return True
+
+    def _validate_chunks(self, original_text: str, chunks: List[str]) -> bool:
+        """Validate that chunks maintain document integrity"""
+        try:
+            # Remove overlap to check content
+            reconstructed = chunks[0]
+            for chunk in chunks[1:]:
+                if len(chunk) > self.chunk_overlap:
+                    reconstructed += chunk[self.chunk_overlap:]
+
+            # Clean both texts for comparison (remove extra whitespace)
+            clean_original = ' '.join(original_text.split())
+            clean_reconstructed = ' '.join(reconstructed.split())
+
+            return clean_original == clean_reconstructed
+        except Exception as e:
+            logging.error(f"Error validating chunks: {str(e)}")
+            return False
+
     def _extract_content(self, file_path: Path) -> str:
         """Extract content from different file formats"""
         suffix = file_path.suffix.lower()
+
         try:
             if suffix == '.pdf':
                 return self._extract_pdf(file_path)
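`split_text` enforces the configured overlap by prepending the tail of the previous chunk to the next one. The overlap check it validates against, reduced to a self-contained function:

```python
def overlap_size(prev_chunk: str, next_chunk: str) -> int:
    """Length of the longest suffix of prev_chunk that prefixes next_chunk."""
    for size in range(min(len(prev_chunk), len(next_chunk)), 0, -1):
        if prev_chunk[-size:] == next_chunk[:size]:
            return size
    return 0

chunks = ["alpha beta gamma", "gamma delta epsilon"]
assert overlap_size(chunks[0], chunks[1]) == len("gamma")
```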
@@ -87,7 +366,8 @@ class DocumentProcessor:
         else:
             raise ValueError(f"Unsupported format: {suffix}")
     except Exception as e:
-        raise Exception(
+        raise Exception(
+            f"Error extracting content from {file_path}: {str(e)}")
 
     def _extract_text(self, file_path: Path) -> str:
         """Extract content from text-based files"""
@@ -104,31 +384,31 @@ class DocumentProcessor:
         with open(file_path, 'rb') as file:
             reader = PyPDF2.PdfReader(file)
             metadata = reader.metadata
+
             for page in reader.pages:
                 text += page.extract_text() + "\n\n"
+
                 # Extract images if available
                 if '/XObject' in page['/Resources']:
                     for obj in page['/Resources']['/XObject'].get_object():
                         if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
                             pass
+
         return text.strip()
 
     def _extract_docx(self, file_path: Path) -> str:
         """Extract text from DOCX with formatting"""
         doc = docx.Document(file_path)
         full_text = []
+
         for para in doc.paragraphs:
             full_text.append(para.text)
+
         for table in doc.tables:
             for row in table.rows:
                 row_text = [cell.text for cell in row.cells]
                 full_text.append(" | ".join(row_text))
+
         return "\n\n".join(full_text)
 
     def _extract_csv(self, file_path: Path) -> str:
@@ -146,10 +426,10 @@ class DocumentProcessor:
         """Extract text from HTML with structure preservation"""
         with open(file_path) as f:
             soup = BeautifulSoup(f, 'html.parser')
+
         for script in soup(["script", "style"]):
             script.decompose()
+
         text = soup.get_text(separator='\n')
         lines = [line.strip() for line in text.splitlines() if line.strip()]
         return "\n\n".join(lines)
@@ -159,12 +439,13 @@ class DocumentProcessor:
         try:
             with open(file_path, 'r', encoding='utf-8') as f:
                 soup = BeautifulSoup(f, 'xml')
+
             for pi in soup.find_all(text=lambda text: isinstance(text, ProcessingInstruction)):
                 pi.extract()
+
             text = soup.get_text(separator='\n')
-            lines = [line.strip()
+            lines = [line.strip()
+                     for line in text.splitlines() if line.strip()]
             return "\n\n".join(lines)
         except Exception as e:
             raise Exception(f"Error processing XML file: {str(e)}")
@@ -173,12 +454,13 @@ class DocumentProcessor:
         """Extract text from RTF files"""
         try:
             import striprtf.striprtf as striprtf
+
             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                 rtf_text = f.read()
+
             plain_text = striprtf.rtf_to_text(rtf_text)
-            lines = [line.strip()
+            lines = [line.strip()
+                     for line in plain_text.splitlines() if line.strip()]
             return "\n\n".join(lines)
         except ImportError:
             raise ImportError("striprtf package is required for RTF support.")
@@ -190,14 +472,15 @@ class DocumentProcessor:
         try:
             # Use enhanced Excel processor
             processed_content = self.excel_processor.process_excel(file_path)
+
             # If processing fails, fall back to basic processing
             if not processed_content:
-                logging.warning(
+                logging.warning(
+                    f"Enhanced Excel processing failed for {file_path}, falling back to basic processing")
                 return self._basic_excel_extract(file_path)
+
             return processed_content
+
         except Exception as e:
             logging.error(f"Error in enhanced Excel processing: {str(e)}")
             # Fall back to basic Excel processing
@@ -208,12 +491,12 @@ class DocumentProcessor:
         try:
             excel_file = pd.ExcelFile(file_path)
             sheets_data = []
+
             for sheet_name in excel_file.sheet_names:
                 df = pd.read_excel(excel_file, sheet_name=sheet_name)
                 sheet_content = f"\nSheet: {sheet_name}\n"
                 sheet_content += "=" * (len(sheet_name) + 7) + "\n"
+
                 if df.empty:
                     sheet_content += "Empty Sheet\n"
                 else:
@@ -223,11 +506,11 @@ class DocumentProcessor:
                         max_cols=None,
                         line_width=120
                     ) + "\n"
+
                 sheets_data.append(sheet_content)
+
             return "\n\n".join(sheets_data)
+
         except Exception as e:
             raise Exception(f"Error in basic Excel processing: {str(e)}")
 
@@ -239,7 +522,7 @@ class DocumentProcessor:
     ) -> Dict:
         """Generate comprehensive metadata"""
         file_stat = file_path.stat()
+
         metadata = {
             'filename': file_path.name,
             'file_type': file_path.suffix,
@@ -252,7 +535,7 @@ class DocumentProcessor:
             'character_count': len(content),
             'processing_timestamp': datetime.now().isoformat()
         }
+
         # Add Excel-specific metadata if applicable
         if file_path.suffix.lower() in ['.xlsx', '.xls']:
             try:
@@ -261,32 +544,42 @@ class DocumentProcessor:
                 metadata.update({'excel_metadata': excel_metadata})
             except Exception as e:
                 logging.warning(f"Could not extract Excel metadata: {str(e)}")
+
         if additional_metadata:
             metadata.update(additional_metadata)
+
         return metadata
 
     def _calculate_hash(self, text: str) -> str:
         """Calculate SHA-256 hash of text"""
         return hashlib.sha256(text.encode()).hexdigest()
 
-    async def process_document(
-        self,
-        file_path: Union[str, Path],
-        metadata: Optional[Dict] = None
-    ) -> Dict:
+    async def process_document(self, file_path: Union[str, Path], metadata: Optional[Dict] = None) -> Dict:
         """Process a document with metadata and content extraction"""
         file_path = Path(file_path)
+
         if not self._validate_file(file_path):
             raise ValueError(f"Invalid file: {file_path}")
 
         content = self._extract_content(file_path)
         doc_metadata = self._generate_metadata(file_path, content, metadata)
+
+        # Try enhanced splitting with validation
+        chunks = self.split_text(content)
+        if not self._validate_chunks(content, chunks):
+            logging.warning(
+                "Enhanced splitting failed validation, falling back to original splitter")
+            chunks = self.text_splitter.split_text(content)
+
+        # Add logging to verify chunk overlap
+        for i in range(len(chunks)-1):
+            logging.debug(f"Chunk {i} ends with: {chunks[i][-50:]}")
+            logging.debug(f"Chunk {i+1} starts with: {chunks[i+1][:50]}")
+            logging.debug(
+                f"Overlap size: {self._calculate_overlap_size(chunks[i], chunks[i+1])} characters")
+
         chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
+
         return {
             'content': content,
             'chunks': chunks,
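`process_document` chains validation, extraction, overlap-aware splitting with a fallback, and per-chunk hashing. A minimal driver sketch (the input path is hypothetical):

```python
import asyncio

async def ingest(processor: DocumentProcessor) -> None:
    # "docs/minutes.docx" is a hypothetical input file
    result = await processor.process_document("docs/minutes.docx")
    print(f"Extracted {len(result['chunks'])} chunks "
          f"from {len(result['content'])} characters")

# asyncio.run(ingest(DocumentProcessor()))
```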
@@ -295,20 +588,28 @@ class DocumentProcessor:
             'statistics': self._generate_statistics(content, chunks)
         }
 
+    def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
+        """Calculate the size of overlap between two chunks"""
+        min_len = min(len(chunk1), len(chunk2))
+        for i in range(min_len, 0, -1):
+            if chunk1[-i:] == chunk2[:i]:
+                return i
+        return 0
+
     def _validate_file(self, file_path: Path) -> bool:
         """Validate file type, size, and content"""
         if not file_path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
+
         if file_path.suffix.lower() not in self.supported_formats:
             raise ValueError(f"Unsupported file format: {file_path.suffix}")
+
         if file_path.stat().st_size > self.max_file_size:
             raise ValueError(f"File too large: {file_path}")
+
         if file_path.stat().st_size == 0:
             raise ValueError(f"Empty file: {file_path}")
+
         return True
 
     def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
@@ -328,7 +629,7 @@ class DocumentProcessor:
     ) -> Dict[str, Dict]:
         """Process multiple documents in parallel"""
         results = {}
+
         if parallel:
             threads = []
             for file_path in file_paths:
@@ -338,13 +639,13 @@ class DocumentProcessor:
                 )
                 threads.append(thread)
                 thread.start()
+
             for thread in threads:
                 thread.join()
         else:
             for file_path in file_paths:
                 await self._process_and_store(file_path, results)
+
         return results
 
     async def _process_and_store(
@@ -357,4 +658,4 @@ class DocumentProcessor:
             result = await self.process_document(file_path)
             results[str(file_path)] = result
         except Exception as e:
-            results[str(file_path)] = {'error': str(e)}
+            results[str(file_path)] = {'error': str(e)}
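The batch path isolates failures per file, storing `{'error': ...}` instead of raising. A sketch of the sequential branch; the batch method's actual name sits outside the shown hunks, so `batch_process` here is an assumed name:

```python
import asyncio

async def ingest_folder(processor: DocumentProcessor) -> None:
    results = await processor.batch_process(  # hypothetical method name
        ["docs/a.pdf", "docs/b.xlsx"],        # hypothetical paths
        parallel=False,
    )
    for path, result in results.items():
        status = "failed: " + result["error"] if "error" in result else "ok"
        print(path, status)
```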
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc
CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
src/vectorstores/chroma_vectorstore.py
CHANGED
@@ -6,17 +6,18 @@ import logging
 
 from .base_vectorstore import BaseVectorStore
 
+
 class ChromaVectorStore(BaseVectorStore):
     def __init__(
-        self,
-        embedding_function: Callable[[List[str]], List[List[float]]],
+        self,
+        embedding_function: Callable[[List[str]], List[List[float]]],
         persist_directory: str = './chroma_db',
         collection_name: str = "documents",
         client_settings: Optional[Dict[str, Any]] = None
     ):
         """
         Initialize Chroma Vector Store
+
         Args:
             embedding_function (Callable): Function to generate embeddings
             persist_directory (str): Directory to persist the vector store
@@ -31,23 +32,24 @@ class ChromaVectorStore(BaseVectorStore):
             self.client = chromadb.PersistentClient(settings=settings)
             self.collection = self.client.get_or_create_collection(
                 name=collection_name,
+                # Using cosine similarity by default
+                metadata={"hnsw:space": "cosine"}
             )
             self.embedding_function = embedding_function
         except Exception as e:
             logging.error(f"Error initializing ChromaDB: {str(e)}")
             raise
+
     def add_documents(
-        self,
-        documents: List[str],
+        self,
+        documents: List[str],
         embeddings: Optional[List[List[float]]] = None,
         metadatas: Optional[List[Dict[str, Any]]] = None,
         ids: Optional[List[str]] = None
     ) -> None:
         """
         Add documents to the vector store
+
         Args:
             documents (List[str]): List of document texts
             embeddings (Optional[List[List[float]]]): Pre-computed embeddings
@@ -63,32 +65,35 @@ class ChromaVectorStore(BaseVectorStore):
                 embeddings = self.embedding_function(documents)
 
             if len(documents) != len(embeddings):
-                raise ValueError(
+                raise ValueError(
+                    "Number of documents and embeddings must match")
+
             # Use provided IDs or generate them
-            doc_ids = ids if ids is not None else [
+            doc_ids = ids if ids is not None else [
+                f"doc_{i}" for i in range(len(documents))]
+
             # Prepare add parameters
             add_params = {
                 "documents": documents,
                 "embeddings": embeddings,
                 "ids": doc_ids
             }
+
             # Only include metadatas if provided
             if metadatas is not None:
                 if len(metadatas) != len(documents):
-                    raise ValueError(
+                    raise ValueError(
+                        "Number of documents and metadatas must match")
                 add_params["metadatas"] = metadatas
+
             self.collection.add(**add_params)
         except Exception as e:
             logging.error(f"Error adding documents to ChromaDB: {str(e)}")
             raise
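A round-trip sketch for the updated store; `toy_embed` is a stand-in for a real embedding model, and the IDs deliberately follow the `{document_id}-chunk-{i}` scheme that `delete_document` reconstructs later in this file:

```python
# Toy embedding function: maps each text to a fixed-size vector of its length.
# Real usage supplies a model-backed callable with the same signature.
def toy_embed(texts):
    return [[float(len(t)), 0.0, 0.0] for t in texts]

store = ChromaVectorStore(embedding_function=toy_embed,
                          persist_directory="./chroma_db")
store.add_documents(
    documents=["first doc", "second doc"],
    metadatas=[{"document_id": "d1", "chunk_index": 0},
               {"document_id": "d1", "chunk_index": 1}],
    ids=["d1-chunk-0", "d1-chunk-1"],
)
```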
+
     def similarity_search(
-        self,
-        query_embedding: List[float],
+        self,
+        query_embedding: List[float],
         top_k: int = 3,
         **kwargs
     ) -> List[Dict[str, Any]]:
@@ -102,21 +107,24 @@ class ChromaVectorStore(BaseVectorStore):
                 n_results=10,  # Get more initial results
                 include=['documents', 'metadatas', 'distances']
             )
+
             if not results or 'documents' not in results or not results['documents']:
                 logging.warning("No results found in similarity search")
                 return []
+
             formatted_results = []
             documents = results['documents'][0]  # First query's results
-            metadatas = results['metadatas'][0] if results.get('metadatas') else [
+            metadatas = results['metadatas'][0] if results.get('metadatas') else [
+                None] * len(documents)
+            distances = results['distances'][0] if results.get('distances') else [
+                None] * len(documents)
+
             # Process all results
             for doc, meta, dist in zip(documents, metadatas, distances):
                 # Convert distance to similarity score (1 is most similar, 0 is least)
-                similarity_score = 1.0 -
+                similarity_score = 1.0 - \
+                    (dist or 0.0) if dist is not None else None
+
                 # More permissive threshold and include all results for filtering
                 if similarity_score is not None and similarity_score > 0.2:  # Lower threshold
                     formatted_results.append({
@@ -124,45 +132,47 @@ class ChromaVectorStore(BaseVectorStore):
                     'metadata': meta or {},
                     'score': similarity_score
                 })
+
             # Sort by score and get top_k results
             formatted_results.sort(key=lambda x: x['score'] or 0, reverse=True)
+
             # Check if results are from same document and get consecutive chunks
             if formatted_results:
-                first_doc_id = formatted_results[0]['metadata'].get(
+                first_doc_id = formatted_results[0]['metadata'].get(
+                    'document_id')
                 all_chunks_same_doc = []
+
                 # Get all chunks from the same document
                 for result in formatted_results:
                     if result['metadata'].get('document_id') == first_doc_id:
                         all_chunks_same_doc.append(result)
+
                 # Sort chunks by their index to maintain document flow
                 all_chunks_same_doc.sort(
                     key=lambda x: x['metadata'].get('chunk_index', 0)
                 )
+
                 # Return either all chunks from same document or top_k results
                 if len(all_chunks_same_doc) > 0:
                     return all_chunks_same_doc[:top_k]
+
             return formatted_results[:top_k]
+
         except Exception as e:
-            logging.error(
+            logging.error(
+                f"Error performing similarity search in ChromaDB: {str(e)}")
             raise
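With `hnsw:space` set to `cosine`, Chroma reports distances where 0.0 means identical, so `1.0 - distance` yields the similarity compared against the 0.2 threshold. A worked check of that conversion:

```python
def to_similarity(distance):
    # Mirrors the conversion in similarity_search: None distances stay None
    return 1.0 - (distance or 0.0) if distance is not None else None

assert to_similarity(0.0) == 1.0    # identical vectors
assert to_similarity(0.7) > 0.2     # kept by the threshold
assert to_similarity(0.9) < 0.2     # filtered out
assert to_similarity(None) is None
```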
+
     def get_all_documents(
         self,
         include_embeddings: bool = False
     ) -> List[Dict[str, Any]]:
         """
         Retrieve all documents from the vector store
+
         Args:
             include_embeddings (bool): Whether to include embeddings in the response
+
         Returns:
             List[Dict[str, Any]]: List of documents with their IDs and optionally embeddings
         """
@@ -170,45 +180,46 @@ class ChromaVectorStore(BaseVectorStore):
             include = ["documents", "metadatas"]
             if include_embeddings:
                 include.append("embeddings")
+
             results = self.collection.get(
                 include=include
             )
+
             if not results or 'documents' not in results:
                 return []
+
             documents = []
             for i in range(len(results['documents'])):
                 doc = {
                     'id': str(i),  # Generate sequential IDs
                     'text': results['documents'][i],
                 }
+
                 if include_embeddings and 'embeddings' in results:
                     doc['embedding'] = results['embeddings'][i]
+
                 if 'metadatas' in results and results['metadatas'][i]:
                     doc['metadata'] = results['metadatas'][i]
+
                     # Use document_id from metadata if available
                     if 'document_id' in results['metadatas'][i]:
                         doc['id'] = results['metadatas'][i]['document_id']
+
                 documents.append(doc)
+
             return documents
         except Exception as e:
-            logging.error(
+            logging.error(
+                f"Error retrieving documents from ChromaDB: {str(e)}")
             raise
+
     def get_document_chunks(self, document_id: str) -> List[Dict[str, Any]]:
         """
         Retrieve all chunks for a specific document
+
         Args:
             document_id (str): ID of the document to retrieve chunks for
+
         Returns:
             List[Dict[str, Any]]: List of document chunks with their metadata
         """
@@ -217,10 +228,10 @@ class ChromaVectorStore(BaseVectorStore):
                 where={"document_id": document_id},
                 include=["documents", "metadatas"]
             )
+
             if not results or 'documents' not in results:
                 return []
+
             chunks = []
             for i in range(len(results['documents'])):
                 chunk = {
@@ -228,10 +239,11 @@ class ChromaVectorStore(BaseVectorStore):
                     'metadata': results['metadatas'][i] if results.get('metadatas') else None
                 }
                 chunks.append(chunk)
+
             # Sort by chunk_index if available; guard against a metadata of None,
             # which x.get('metadata', {}) alone would not handle
-            chunks.sort(key=lambda x: x.get(
+            chunks.sort(key=lambda x: (x.get('metadata') or {}).get('chunk_index', 0))
+
             return chunks
         except Exception as e:
             logging.error(f"Error retrieving document chunks: {str(e)}")
@@ -240,7 +252,7 @@ class ChromaVectorStore(BaseVectorStore):
     def delete_document(self, document_id: str) -> None:
         """
         Delete all chunks associated with a document_id
+
         Args:
             document_id (str): ID of the document to delete
         """
@@ -250,15 +262,17 @@ class ChromaVectorStore(BaseVectorStore):
                 where={"document_id": document_id},
                 include=["metadatas"]
             )
+
             if not results or 'ids' not in results:
                 logging.warning(f"No document found with ID: {document_id}")
                 return
+
             # Delete all chunks associated with the document
+            chunk_ids = [
+                f"{document_id}-chunk-{i}" for i in range(len(results['metadatas']))]
             self.collection.delete(ids=chunk_ids)
+
         except Exception as e:
+            logging.error(
+                f"Error deleting document {document_id} from ChromaDB: {str(e)}")
+            raise
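`delete_document` does not read chunk IDs back from the query; it rebuilds them from the `{document_id}-chunk-{i}` convention, so ingestion must use the same scheme. Continuing the `store` instance from the earlier sketch:

```python
doc_id = "d1"
chunks = ["first chunk", "second chunk"]

# IDs must follow the convention delete_document() later reconstructs.
store.add_documents(
    documents=chunks,
    metadatas=[{"document_id": doc_id, "chunk_index": i}
               for i in range(len(chunks))],
    ids=[f"{doc_id}-chunk-{i}" for i in range(len(chunks))],
)

store.delete_document(doc_id)  # removes d1-chunk-0 and d1-chunk-1
```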
temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx
CHANGED
Binary files a/temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx and b/temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx differ