Spaces:
Running
Running
Commit
·
b953016
1
Parent(s):
9700f95
Enhanced the support for the excel file and added endpoint to have optimized vector store and Rag for the Excel.
Browse files- config/__pycache__/config.cpython-312.pyc +0 -0
- config/config.py +2 -0
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/agents/__pycache__/excel_aware_rag.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
- src/agents/excel_aware_rag.py +237 -0
- src/agents/rag_agent.py +25 -2
- src/db/__pycache__/mongodb_store.cpython-312.pyc +0 -0
- src/db/mongodb_store.py +50 -3
- src/main.py +173 -14
- src/models/UserContact.py +28 -0
- src/models/__pycache__/UserContact.cpython-312.pyc +0 -0
- src/utils/__pycache__/database_cleanup.cpython-312.pyc +0 -0
- src/utils/__pycache__/document_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/llm_utils.cpython-312.pyc +0 -0
- src/utils/database_cleanup.py +182 -0
- src/utils/document_processor.py +169 -71
- src/utils/enhanced_excel_processor.py +187 -0
- src/utils/excel_integration +139 -0
- src/utils/llm_utils.py +11 -9
- src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc +0 -0
- src/vectorstores/optimized_vectorstore.py +137 -0
config/__pycache__/config.cpython-312.pyc
CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
|
|
config/config.py
CHANGED
@@ -10,6 +10,8 @@ class Settings:
|
|
10 |
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
|
11 |
OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')
|
12 |
|
|
|
|
|
13 |
# Ollama Configuration
|
14 |
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
|
15 |
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')
|
|
|
10 |
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
|
11 |
OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')
|
12 |
|
13 |
+
ADMIN_API_KEY = 'aca4081f-6ff2-434c-843b-98f60285c499'
|
14 |
+
|
15 |
# Ollama Configuration
|
16 |
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
|
17 |
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')
|
src/__pycache__/main.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
|
|
src/agents/__pycache__/excel_aware_rag.cpython-312.pyc
ADDED
Binary file (10.1 kB). View file
|
|
src/agents/__pycache__/rag_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
|
|
src/agents/excel_aware_rag.py
ADDED
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# src/agents/excel_aware_rag.py
|
2 |
+
from typing import List, Dict, Optional, Set
|
3 |
+
from src.utils.logger import logger
|
4 |
+
|
5 |
+
class ExcelAwareRAGAgent:
|
6 |
+
"""Extension of RAGAgent with enhanced Excel handling"""
|
7 |
+
|
8 |
+
def _process_excel_context(self, context_docs: List[str], query: str) -> List[str]:
|
9 |
+
"""
|
10 |
+
Process and enhance context for Excel-related queries
|
11 |
+
|
12 |
+
Args:
|
13 |
+
context_docs (List[str]): Original context documents
|
14 |
+
query (str): User query
|
15 |
+
|
16 |
+
Returns:
|
17 |
+
List[str]: Enhanced context documents
|
18 |
+
"""
|
19 |
+
excel_context = []
|
20 |
+
|
21 |
+
for doc in context_docs:
|
22 |
+
if 'Sheet:' in doc: # Identify Excel content
|
23 |
+
# Extract relevant sheet context based on query
|
24 |
+
relevant_sheets = self._identify_relevant_sheets(doc, query)
|
25 |
+
for sheet in relevant_sheets:
|
26 |
+
sheet_context = self._extract_sheet_context(doc, sheet)
|
27 |
+
if sheet_context:
|
28 |
+
excel_context.append(sheet_context)
|
29 |
+
|
30 |
+
# Add relationship context if query suggests multi-sheet analysis
|
31 |
+
if self._needs_relationship_context(query):
|
32 |
+
relationship_context = self._extract_relationship_context(doc)
|
33 |
+
if relationship_context:
|
34 |
+
excel_context.append(relationship_context)
|
35 |
+
else:
|
36 |
+
excel_context.append(doc)
|
37 |
+
|
38 |
+
return excel_context
|
39 |
+
|
40 |
+
def _identify_relevant_sheets(self, doc: str, query: str) -> List[str]:
|
41 |
+
"""Identify sheets relevant to the query"""
|
42 |
+
sheets = []
|
43 |
+
current_sheet = None
|
44 |
+
|
45 |
+
# Extract sheet names from the document
|
46 |
+
for line in doc.split('\n'):
|
47 |
+
if line.startswith('Sheet: '):
|
48 |
+
current_sheet = line.replace('Sheet: ', '').strip()
|
49 |
+
# Check if sheet name or its contents are relevant to query
|
50 |
+
if self._is_relevant_to_query(current_sheet, query):
|
51 |
+
sheets.append(current_sheet)
|
52 |
+
|
53 |
+
return sheets
|
54 |
+
|
55 |
+
def _is_relevant_to_query(self, sheet_name: str, query: str) -> bool:
|
56 |
+
"""Check if a sheet is relevant to the query"""
|
57 |
+
# Convert to lower case for comparison
|
58 |
+
query_lower = query.lower()
|
59 |
+
sheet_lower = sheet_name.lower()
|
60 |
+
|
61 |
+
# Direct mention of sheet name
|
62 |
+
if sheet_lower in query_lower:
|
63 |
+
return True
|
64 |
+
|
65 |
+
# Check for related terms
|
66 |
+
sheet_terms = set(sheet_lower.split())
|
67 |
+
query_terms = set(query_lower.split())
|
68 |
+
|
69 |
+
# If there's significant term overlap
|
70 |
+
common_terms = sheet_terms.intersection(query_terms)
|
71 |
+
if len(common_terms) > 0:
|
72 |
+
return True
|
73 |
+
|
74 |
+
return False
|
75 |
+
|
76 |
+
def _extract_sheet_context(self, doc: str, sheet_name: str) -> Optional[str]:
|
77 |
+
"""Extract context for a specific sheet"""
|
78 |
+
lines = doc.split('\n')
|
79 |
+
sheet_context = []
|
80 |
+
in_target_sheet = False
|
81 |
+
|
82 |
+
for line in lines:
|
83 |
+
if line.startswith(f'Sheet: {sheet_name}'):
|
84 |
+
in_target_sheet = True
|
85 |
+
sheet_context.append(line)
|
86 |
+
elif line.startswith('Sheet: '):
|
87 |
+
in_target_sheet = False
|
88 |
+
elif in_target_sheet:
|
89 |
+
sheet_context.append(line)
|
90 |
+
|
91 |
+
return '\n'.join(sheet_context) if sheet_context else None
|
92 |
+
|
93 |
+
def _needs_relationship_context(self, query: str) -> bool:
|
94 |
+
"""Determine if query needs relationship context"""
|
95 |
+
relationship_indicators = [
|
96 |
+
'compare', 'relationship', 'between', 'across', 'correlation',
|
97 |
+
'related', 'connection', 'link', 'join', 'combine', 'multiple sheets',
|
98 |
+
'all sheets', 'different sheets'
|
99 |
+
]
|
100 |
+
|
101 |
+
query_lower = query.lower()
|
102 |
+
return any(indicator in query_lower for indicator in relationship_indicators)
|
103 |
+
|
104 |
+
def _extract_relationship_context(self, doc: str) -> Optional[str]:
|
105 |
+
"""Extract relationship context from document"""
|
106 |
+
lines = doc.split('\n')
|
107 |
+
relationship_context = []
|
108 |
+
in_relationships = False
|
109 |
+
|
110 |
+
for line in lines:
|
111 |
+
if 'Sheet Relationships:' in line:
|
112 |
+
in_relationships = True
|
113 |
+
relationship_context.append(line)
|
114 |
+
elif in_relationships and line.strip() and not line.startswith('Sheet: '):
|
115 |
+
relationship_context.append(line)
|
116 |
+
elif in_relationships and line.startswith('Sheet: '):
|
117 |
+
break
|
118 |
+
|
119 |
+
return '\n'.join(relationship_context) if relationship_context else None
|
120 |
+
|
121 |
+
async def enhance_excel_response(
|
122 |
+
self,
|
123 |
+
query: str,
|
124 |
+
response: str,
|
125 |
+
context_docs: List[str]
|
126 |
+
) -> str:
|
127 |
+
"""
|
128 |
+
Enhance response for Excel-related queries
|
129 |
+
|
130 |
+
Args:
|
131 |
+
query (str): Original query
|
132 |
+
response (str): Generated response
|
133 |
+
context_docs (List[str]): Context documents
|
134 |
+
|
135 |
+
Returns:
|
136 |
+
str: Enhanced response
|
137 |
+
"""
|
138 |
+
if not any('Sheet:' in doc for doc in context_docs):
|
139 |
+
return response
|
140 |
+
|
141 |
+
try:
|
142 |
+
# Enhance response with specific Excel insights
|
143 |
+
enhanced_parts = [response]
|
144 |
+
|
145 |
+
# Add sheet-specific insights if relevant
|
146 |
+
if self._needs_sheet_specific_insights(query):
|
147 |
+
insights = self._generate_sheet_insights(query, context_docs)
|
148 |
+
if insights:
|
149 |
+
enhanced_parts.append("\nAdditional Sheet Insights:")
|
150 |
+
enhanced_parts.extend(insights)
|
151 |
+
|
152 |
+
# Add relationship insights if relevant
|
153 |
+
if self._needs_relationship_context(query):
|
154 |
+
relationship_insights = self._generate_relationship_insights(context_docs)
|
155 |
+
if relationship_insights:
|
156 |
+
enhanced_parts.append("\nSheet Relationship Insights:")
|
157 |
+
enhanced_parts.extend(relationship_insights)
|
158 |
+
|
159 |
+
return "\n".join(enhanced_parts)
|
160 |
+
except Exception as e:
|
161 |
+
logger.error(f"Error enhancing Excel response: {str(e)}")
|
162 |
+
return response # Fall back to original response if enhancement fails
|
163 |
+
|
164 |
+
def _needs_sheet_specific_insights(self, query: str) -> bool:
|
165 |
+
"""Determine if query needs sheet-specific insights"""
|
166 |
+
insight_indicators = [
|
167 |
+
'analyze', 'summarize', 'tell me about', 'what is in',
|
168 |
+
'show me', 'describe', 'explain', 'give me details'
|
169 |
+
]
|
170 |
+
|
171 |
+
query_lower = query.lower()
|
172 |
+
return any(indicator in query_lower for indicator in insight_indicators)
|
173 |
+
|
174 |
+
def _generate_sheet_insights(self, query: str, context_docs: List[str]) -> List[str]:
|
175 |
+
"""Generate insights for relevant sheets"""
|
176 |
+
insights = []
|
177 |
+
relevant_sheets = set()
|
178 |
+
|
179 |
+
# Collect relevant sheets from context
|
180 |
+
for doc in context_docs:
|
181 |
+
if 'Sheet:' in doc:
|
182 |
+
sheets = self._identify_relevant_sheets(doc, query)
|
183 |
+
relevant_sheets.update(sheets)
|
184 |
+
|
185 |
+
# Generate insights for each relevant sheet
|
186 |
+
for sheet in relevant_sheets:
|
187 |
+
sheet_insights = self._generate_single_sheet_insights(sheet, context_docs)
|
188 |
+
if sheet_insights:
|
189 |
+
insights.extend(sheet_insights)
|
190 |
+
|
191 |
+
return insights
|
192 |
+
|
193 |
+
def _generate_single_sheet_insights(
|
194 |
+
self,
|
195 |
+
sheet_name: str,
|
196 |
+
context_docs: List[str]
|
197 |
+
) -> List[str]:
|
198 |
+
"""Generate insights for a single sheet"""
|
199 |
+
insights = []
|
200 |
+
sheet_context = None
|
201 |
+
|
202 |
+
# Find context for this sheet
|
203 |
+
for doc in context_docs:
|
204 |
+
if f'Sheet: {sheet_name}' in doc:
|
205 |
+
sheet_context = self._extract_sheet_context(doc, sheet_name)
|
206 |
+
break
|
207 |
+
|
208 |
+
if not sheet_context:
|
209 |
+
return insights
|
210 |
+
|
211 |
+
# Extract and summarize key information
|
212 |
+
if 'Numeric Columns Summary:' in sheet_context:
|
213 |
+
numeric_insights = self._extract_numeric_insights(sheet_context)
|
214 |
+
if numeric_insights:
|
215 |
+
insights.extend(numeric_insights)
|
216 |
+
|
217 |
+
if 'Categorical Columns Summary:' in sheet_context:
|
218 |
+
categorical_insights = self._extract_categorical_insights(sheet_context)
|
219 |
+
if categorical_insights:
|
220 |
+
insights.extend(categorical_insights)
|
221 |
+
|
222 |
+
return insights
|
223 |
+
|
224 |
+
def _generate_relationship_insights(self, context_docs: List[str]) -> List[str]:
|
225 |
+
"""Generate insights about relationships between sheets"""
|
226 |
+
insights = []
|
227 |
+
|
228 |
+
for doc in context_docs:
|
229 |
+
relationship_context = self._extract_relationship_context(doc)
|
230 |
+
if relationship_context:
|
231 |
+
# Process and format relationship information
|
232 |
+
relationships = relationship_context.split('\n')[1:] # Skip header
|
233 |
+
for rel in relationships:
|
234 |
+
if rel.strip():
|
235 |
+
insights.append(f"- {rel.strip()}")
|
236 |
+
|
237 |
+
return insights
|
src/agents/rag_agent.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2 |
from typing import List, Optional, Tuple, Dict
|
3 |
import uuid
|
4 |
|
|
|
5 |
from ..llms.base_llm import BaseLLM
|
6 |
from src.embeddings.base_embedding import BaseEmbedding
|
7 |
from src.vectorstores.base_vectorstore import BaseVectorStore
|
@@ -10,7 +11,7 @@ from src.db.mongodb_store import MongoDBStore
|
|
10 |
from src.models.rag import RAGResponse
|
11 |
from src.utils.logger import logger
|
12 |
|
13 |
-
class RAGAgent:
|
14 |
def __init__(
|
15 |
self,
|
16 |
llm: BaseLLM,
|
@@ -31,6 +32,7 @@ class RAGAgent:
|
|
31 |
max_history_tokens (int): Maximum tokens in conversation history
|
32 |
max_history_messages (int): Maximum messages to keep in history
|
33 |
"""
|
|
|
34 |
self.llm = llm
|
35 |
self.embedding = embedding
|
36 |
self.vector_store = vector_store
|
@@ -77,6 +79,15 @@ class RAGAgent:
|
|
77 |
sources = None
|
78 |
scores = None
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
# Generate prompt with context and history
|
81 |
augmented_prompt = self.conversation_manager.generate_prompt_with_history(
|
82 |
current_query=query,
|
@@ -84,13 +95,25 @@ class RAGAgent:
|
|
84 |
context_docs=context_docs
|
85 |
)
|
86 |
|
87 |
-
# Generate response using LLM
|
88 |
response = self.llm.generate(
|
89 |
augmented_prompt,
|
90 |
temperature=temperature,
|
91 |
max_tokens=max_tokens
|
92 |
)
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
return RAGResponse(
|
95 |
response=response,
|
96 |
context_docs=context_docs,
|
|
|
2 |
from typing import List, Optional, Tuple, Dict
|
3 |
import uuid
|
4 |
|
5 |
+
from .excel_aware_rag import ExcelAwareRAGAgent
|
6 |
from ..llms.base_llm import BaseLLM
|
7 |
from src.embeddings.base_embedding import BaseEmbedding
|
8 |
from src.vectorstores.base_vectorstore import BaseVectorStore
|
|
|
11 |
from src.models.rag import RAGResponse
|
12 |
from src.utils.logger import logger
|
13 |
|
14 |
+
class RAGAgent(ExcelAwareRAGAgent):
|
15 |
def __init__(
|
16 |
self,
|
17 |
llm: BaseLLM,
|
|
|
32 |
max_history_tokens (int): Maximum tokens in conversation history
|
33 |
max_history_messages (int): Maximum messages to keep in history
|
34 |
"""
|
35 |
+
super().__init__() # Initialize ExcelAwareRAGAgent
|
36 |
self.llm = llm
|
37 |
self.embedding = embedding
|
38 |
self.vector_store = vector_store
|
|
|
79 |
sources = None
|
80 |
scores = None
|
81 |
|
82 |
+
# Check if this is an Excel-related query and enhance context if needed
|
83 |
+
has_excel_content = any('Sheet:' in doc for doc in (context_docs or []))
|
84 |
+
if has_excel_content:
|
85 |
+
try:
|
86 |
+
context_docs = self._process_excel_context(context_docs, query)
|
87 |
+
except Exception as e:
|
88 |
+
logger.warning(f"Error processing Excel context: {str(e)}")
|
89 |
+
# Continue with original context if Excel processing fails
|
90 |
+
|
91 |
# Generate prompt with context and history
|
92 |
augmented_prompt = self.conversation_manager.generate_prompt_with_history(
|
93 |
current_query=query,
|
|
|
95 |
context_docs=context_docs
|
96 |
)
|
97 |
|
98 |
+
# Generate initial response using LLM
|
99 |
response = self.llm.generate(
|
100 |
augmented_prompt,
|
101 |
temperature=temperature,
|
102 |
max_tokens=max_tokens
|
103 |
)
|
104 |
|
105 |
+
# Enhance response for Excel queries if applicable
|
106 |
+
if has_excel_content:
|
107 |
+
try:
|
108 |
+
response = await self.enhance_excel_response(
|
109 |
+
query=query,
|
110 |
+
response=response,
|
111 |
+
context_docs=context_docs
|
112 |
+
)
|
113 |
+
except Exception as e:
|
114 |
+
logger.warning(f"Error enhancing Excel response: {str(e)}")
|
115 |
+
# Continue with original response if enhancement fails
|
116 |
+
|
117 |
return RAGResponse(
|
118 |
response=response,
|
119 |
context_docs=context_docs,
|
src/db/__pycache__/mongodb_store.cpython-312.pyc
CHANGED
Binary files a/src/db/__pycache__/mongodb_store.cpython-312.pyc and b/src/db/__pycache__/mongodb_store.cpython-312.pyc differ
|
|
src/db/mongodb_store.py
CHANGED
@@ -62,14 +62,53 @@ class MongoDBStore:
|
|
62 |
"""Delete document from MongoDB"""
|
63 |
result = await self.documents.delete_one({"document_id": document_id})
|
64 |
return result.deleted_count > 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
# Conversation and chat history methods
|
67 |
async def create_conversation(
|
68 |
self,
|
69 |
conversation_id: str,
|
70 |
-
metadata: Optional[Dict] = None
|
|
|
|
|
|
|
71 |
) -> str:
|
72 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
conversation = {
|
74 |
"conversation_id": conversation_id,
|
75 |
"created_at": datetime.now(),
|
@@ -77,7 +116,15 @@ class MongoDBStore:
|
|
77 |
"message_count": 0,
|
78 |
"metadata": metadata or {}
|
79 |
}
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
await self.conversations.insert_one(conversation)
|
82 |
return conversation_id
|
83 |
|
|
|
62 |
"""Delete document from MongoDB"""
|
63 |
result = await self.documents.delete_one({"document_id": document_id})
|
64 |
return result.deleted_count > 0
|
65 |
+
|
66 |
+
async def find_existing_user(
|
67 |
+
self,
|
68 |
+
email: str,
|
69 |
+
phone_number: str
|
70 |
+
) -> Optional[str]:
|
71 |
+
"""
|
72 |
+
Find existing user by email or phone number
|
73 |
+
|
74 |
+
Args:
|
75 |
+
email (str): User's email
|
76 |
+
phone_number (str): User's phone number
|
77 |
+
|
78 |
+
Returns:
|
79 |
+
Optional[str]: Conversation ID if found, None otherwise
|
80 |
+
"""
|
81 |
+
result = await self.conversations.find_one({
|
82 |
+
"$or": [
|
83 |
+
{"email": email},
|
84 |
+
{"phone_number": phone_number}
|
85 |
+
]
|
86 |
+
})
|
87 |
+
|
88 |
+
return result["conversation_id"] if result else None
|
89 |
|
90 |
# Conversation and chat history methods
|
91 |
async def create_conversation(
|
92 |
self,
|
93 |
conversation_id: str,
|
94 |
+
metadata: Optional[Dict] = None,
|
95 |
+
full_name: Optional[str] = None,
|
96 |
+
email: Optional[str] = None,
|
97 |
+
phone_number: Optional[str] = None
|
98 |
) -> str:
|
99 |
+
"""
|
100 |
+
Create a new conversation
|
101 |
+
|
102 |
+
Args:
|
103 |
+
conversation_id (str): Unique conversation ID
|
104 |
+
metadata (Optional[Dict]): Additional metadata
|
105 |
+
full_name (Optional[str]): User's full name
|
106 |
+
email (Optional[str]): User's email
|
107 |
+
phone_number (Optional[str]): User's phone number
|
108 |
+
|
109 |
+
Returns:
|
110 |
+
str: Conversation ID
|
111 |
+
"""
|
112 |
conversation = {
|
113 |
"conversation_id": conversation_id,
|
114 |
"created_at": datetime.now(),
|
|
|
116 |
"message_count": 0,
|
117 |
"metadata": metadata or {}
|
118 |
}
|
119 |
+
|
120 |
+
# Add user information if provided
|
121 |
+
if full_name:
|
122 |
+
conversation["full_name"] = full_name
|
123 |
+
if email:
|
124 |
+
conversation["email"] = email
|
125 |
+
if phone_number:
|
126 |
+
conversation["phone_number"] = phone_number
|
127 |
+
|
128 |
await self.conversations.insert_one(conversation)
|
129 |
return conversation_id
|
130 |
|
src/main.py
CHANGED
@@ -12,6 +12,7 @@ import os
|
|
12 |
# Import custom modules1
|
13 |
from src.agents.rag_agent import RAGAgent
|
14 |
from src.models.document import AllDocumentsResponse, StoredDocument
|
|
|
15 |
from src.utils.document_processor import DocumentProcessor
|
16 |
from src.utils.conversation_summarizer import ConversationSummarizer
|
17 |
from src.utils.logger import logger
|
@@ -21,12 +22,15 @@ from src.implementations.document_service import DocumentService
|
|
21 |
from src.models import (
|
22 |
ChatRequest,
|
23 |
ChatResponse,
|
24 |
-
DocumentResponse,
|
25 |
BatchUploadResponse,
|
26 |
SummarizeRequest,
|
27 |
SummaryResponse,
|
28 |
FeedbackRequest
|
29 |
)
|
|
|
|
|
|
|
|
|
30 |
from config.config import settings
|
31 |
|
32 |
app = FastAPI(title="Chatbot API")
|
@@ -54,6 +58,18 @@ UPLOADS_DIR.mkdir(exist_ok=True)
|
|
54 |
# Mount the uploads directory for static file serving
|
55 |
app.mount("/docs", StaticFiles(directory=str(UPLOADS_DIR)), name="documents")
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
@app.get("/documents")
|
58 |
async def get_all_documents():
|
59 |
"""Get all documents from MongoDB"""
|
@@ -190,18 +206,72 @@ async def delete_document(document_id: str):
|
|
190 |
except Exception as e:
|
191 |
logger.error(f"Error in delete_document endpoint: {str(e)}")
|
192 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
|
194 |
@app.post("/chat", response_model=ChatResponse)
|
195 |
async def chat_endpoint(
|
196 |
request: ChatRequest,
|
197 |
background_tasks: BackgroundTasks
|
198 |
):
|
199 |
-
"""Chat endpoint with RAG support"""
|
200 |
try:
|
|
|
|
|
201 |
vector_store, embedding_model = await get_vector_store()
|
|
|
|
|
202 |
llm = get_llm_instance(request.llm_provider)
|
203 |
|
204 |
-
# Initialize RAG agent
|
205 |
rag_agent = RAGAgent(
|
206 |
llm=llm,
|
207 |
embedding=embedding_model,
|
@@ -211,14 +281,69 @@ async def chat_endpoint(
|
|
211 |
|
212 |
# Use provided conversation ID or create new one
|
213 |
conversation_id = request.conversation_id or str(uuid.uuid4())
|
214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
# Generate response
|
216 |
-
response
|
217 |
-
query=query,
|
218 |
-
conversation_id=conversation_id,
|
219 |
-
temperature=request.temperature
|
220 |
-
)
|
221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
# Store message in chat history
|
223 |
await mongodb.store_message(
|
224 |
conversation_id=conversation_id,
|
@@ -228,19 +353,32 @@ async def chat_endpoint(
|
|
228 |
sources=response.sources,
|
229 |
llm_provider=request.llm_provider
|
230 |
)
|
231 |
-
|
232 |
-
return
|
|
|
233 |
response=response.response,
|
234 |
context=response.context_docs,
|
235 |
sources=response.sources,
|
236 |
conversation_id=conversation_id,
|
237 |
timestamp=datetime.now(),
|
238 |
-
relevant_doc_scores=response.scores if hasattr(response, 'scores') else None
|
|
|
239 |
)
|
|
|
|
|
|
|
240 |
|
|
|
|
|
241 |
except Exception as e:
|
242 |
-
logger.error(f"Error in chat endpoint: {str(e)}")
|
243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
|
245 |
@app.get("/chat/history/{conversation_id}")
|
246 |
async def get_conversation_history(conversation_id: str):
|
@@ -347,6 +485,27 @@ async def debug_config():
|
|
347 |
|
348 |
return debug_info
|
349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
350 |
@app.get("/health")
|
351 |
async def health_check():
|
352 |
"""Health check endpoint"""
|
|
|
12 |
# Import custom modules1
|
13 |
from src.agents.rag_agent import RAGAgent
|
14 |
from src.models.document import AllDocumentsResponse, StoredDocument
|
15 |
+
from src.models.UserContact import UserContactRequest
|
16 |
from src.utils.document_processor import DocumentProcessor
|
17 |
from src.utils.conversation_summarizer import ConversationSummarizer
|
18 |
from src.utils.logger import logger
|
|
|
22 |
from src.models import (
|
23 |
ChatRequest,
|
24 |
ChatResponse,
|
|
|
25 |
BatchUploadResponse,
|
26 |
SummarizeRequest,
|
27 |
SummaryResponse,
|
28 |
FeedbackRequest
|
29 |
)
|
30 |
+
from fastapi import HTTPException, Depends
|
31 |
+
from fastapi.security import APIKeyHeader
|
32 |
+
from src.utils.database_cleanup import perform_cleanup
|
33 |
+
|
34 |
from config.config import settings
|
35 |
|
36 |
app = FastAPI(title="Chatbot API")
|
|
|
58 |
# Mount the uploads directory for static file serving
|
59 |
app.mount("/docs", StaticFiles(directory=str(UPLOADS_DIR)), name="documents")
|
60 |
|
61 |
+
# Security setup
|
62 |
+
API_KEY_HEADER = APIKeyHeader(name="ADMIN_API_KEY")
|
63 |
+
|
64 |
+
async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
|
65 |
+
"""Verify admin API key"""
|
66 |
+
if not settings.ADMIN_API_KEY or api_key != settings.ADMIN_API_KEY:
|
67 |
+
raise HTTPException(
|
68 |
+
status_code=403,
|
69 |
+
detail="Invalid or missing API key"
|
70 |
+
)
|
71 |
+
return api_key
|
72 |
+
|
73 |
@app.get("/documents")
|
74 |
async def get_all_documents():
|
75 |
"""Get all documents from MongoDB"""
|
|
|
206 |
except Exception as e:
|
207 |
logger.error(f"Error in delete_document endpoint: {str(e)}")
|
208 |
raise HTTPException(status_code=500, detail=str(e))
|
209 |
+
|
210 |
+
# src/main.py
|
211 |
+
|
212 |
+
@app.post("/user/contact", response_model=ChatResponse)
|
213 |
+
async def create_user_contact(
|
214 |
+
request: UserContactRequest,
|
215 |
+
background_tasks: BackgroundTasks
|
216 |
+
):
|
217 |
+
"""Create or retrieve user conversation based on contact information"""
|
218 |
+
try:
|
219 |
+
# Check for existing user
|
220 |
+
existing_conversation_id = await mongodb.find_existing_user(
|
221 |
+
email=request.email,
|
222 |
+
phone_number=request.phone_number
|
223 |
+
)
|
224 |
+
|
225 |
+
if existing_conversation_id:
|
226 |
+
chat_request = ChatRequest(
|
227 |
+
query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. Create a welcome back message for him and ask how i can help you today?',
|
228 |
+
llm_provider="openai",
|
229 |
+
max_context_docs=3,
|
230 |
+
temperature=1.0,
|
231 |
+
stream=False,
|
232 |
+
conversation_id=existing_conversation_id
|
233 |
+
)
|
234 |
+
else:
|
235 |
+
# Create new conversation with user information
|
236 |
+
new_conversation_id = str(uuid.uuid4())
|
237 |
+
await mongodb.create_conversation(
|
238 |
+
conversation_id=new_conversation_id,
|
239 |
+
full_name=request.full_name,
|
240 |
+
email=request.email,
|
241 |
+
phone_number=request.phone_number
|
242 |
+
)
|
243 |
+
|
244 |
+
chat_request = ChatRequest(
|
245 |
+
query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. Create a welcome message for him and ask how i can help you today?',
|
246 |
+
llm_provider="openai",
|
247 |
+
max_context_docs=3,
|
248 |
+
temperature=1.0,
|
249 |
+
stream=False,
|
250 |
+
conversation_id=new_conversation_id
|
251 |
+
)
|
252 |
+
|
253 |
+
# Call chat_endpoint with the prepared request
|
254 |
+
return await chat_endpoint(chat_request, background_tasks)
|
255 |
+
|
256 |
+
except Exception as e:
|
257 |
+
logger.error(f"Error in create_user_contact: {str(e)}")
|
258 |
+
raise HTTPException(status_code=500, detail=str(e))
|
259 |
|
260 |
@app.post("/chat", response_model=ChatResponse)
|
261 |
async def chat_endpoint(
|
262 |
request: ChatRequest,
|
263 |
background_tasks: BackgroundTasks
|
264 |
):
|
265 |
+
"""Chat endpoint with RAG support and enhanced Excel handling"""
|
266 |
try:
|
267 |
+
# Initialize core components
|
268 |
+
logger.info(f"Initializing vector store and embedding: {str(datetime.now())}")
|
269 |
vector_store, embedding_model = await get_vector_store()
|
270 |
+
|
271 |
+
logger.info(f"Initializing LLM: {str(datetime.now())}")
|
272 |
llm = get_llm_instance(request.llm_provider)
|
273 |
|
274 |
+
# Initialize RAG agent
|
275 |
rag_agent = RAGAgent(
|
276 |
llm=llm,
|
277 |
embedding=embedding_model,
|
|
|
281 |
|
282 |
# Use provided conversation ID or create new one
|
283 |
conversation_id = request.conversation_id or str(uuid.uuid4())
|
284 |
+
|
285 |
+
# Process the query
|
286 |
+
query = request.query
|
287 |
+
|
288 |
+
# Add specific instructions for certain types of queries
|
289 |
+
#if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
|
290 |
+
query += ". The response should be short and to the point. Make sure to not add any irrelevant information. Keep the introduction concise and friendly."
|
291 |
+
|
292 |
# Generate response
|
293 |
+
logger.info(f"Generating response: {str(datetime.now())}")
|
|
|
|
|
|
|
|
|
294 |
|
295 |
+
max_retries = 3
|
296 |
+
retry_count = 0
|
297 |
+
response = None
|
298 |
+
last_error = None
|
299 |
+
|
300 |
+
while retry_count < max_retries and response is None:
|
301 |
+
try:
|
302 |
+
response = await rag_agent.generate_response(
|
303 |
+
query=query,
|
304 |
+
conversation_id=conversation_id,
|
305 |
+
temperature=request.temperature,
|
306 |
+
max_tokens=request.max_tokens if hasattr(request, 'max_tokens') else None
|
307 |
+
)
|
308 |
+
break
|
309 |
+
except Exception as e:
|
310 |
+
last_error = e
|
311 |
+
retry_count += 1
|
312 |
+
logger.warning(f"Attempt {retry_count} failed: {str(e)}")
|
313 |
+
await asyncio.sleep(1) # Brief pause before retry
|
314 |
+
|
315 |
+
if response is None:
|
316 |
+
raise last_error or Exception("Failed to generate response after retries")
|
317 |
+
|
318 |
+
logger.info(f"Response generated: {str(datetime.now())}")
|
319 |
+
|
320 |
+
# Prepare response metadata
|
321 |
+
metadata = {
|
322 |
+
'llm_provider': request.llm_provider,
|
323 |
+
'temperature': request.temperature,
|
324 |
+
'conversation_id': conversation_id
|
325 |
+
}
|
326 |
+
|
327 |
+
# Add Excel-specific metadata if present
|
328 |
+
has_excel_content = any(
|
329 |
+
doc and 'Sheet:' in doc
|
330 |
+
for doc in (response.context_docs or [])
|
331 |
+
)
|
332 |
+
if has_excel_content:
|
333 |
+
try:
|
334 |
+
metadata['excel_content'] = True
|
335 |
+
|
336 |
+
# Extract Excel-specific insights if available
|
337 |
+
if hasattr(rag_agent, 'get_excel_insights'):
|
338 |
+
excel_insights = rag_agent.get_excel_insights(
|
339 |
+
query=query,
|
340 |
+
context_docs=response.context_docs
|
341 |
+
)
|
342 |
+
if excel_insights:
|
343 |
+
metadata['excel_insights'] = excel_insights
|
344 |
+
except Exception as e:
|
345 |
+
logger.warning(f"Error processing Excel metadata: {str(e)}")
|
346 |
+
|
347 |
# Store message in chat history
|
348 |
await mongodb.store_message(
|
349 |
conversation_id=conversation_id,
|
|
|
353 |
sources=response.sources,
|
354 |
llm_provider=request.llm_provider
|
355 |
)
|
356 |
+
|
357 |
+
# Prepare and return response
|
358 |
+
chat_response = ChatResponse(
|
359 |
response=response.response,
|
360 |
context=response.context_docs,
|
361 |
sources=response.sources,
|
362 |
conversation_id=conversation_id,
|
363 |
timestamp=datetime.now(),
|
364 |
+
relevant_doc_scores=response.scores if hasattr(response, 'scores') else None,
|
365 |
+
metadata=metadata
|
366 |
)
|
367 |
+
|
368 |
+
# Log completion
|
369 |
+
logger.info(f"Chat response completed: {str(datetime.now())}")
|
370 |
|
371 |
+
return chat_response
|
372 |
+
|
373 |
except Exception as e:
|
374 |
+
logger.error(f"Error in chat endpoint: {str(e)}", exc_info=True)
|
375 |
+
# Convert known errors to HTTPException with appropriate status codes
|
376 |
+
if isinstance(e, ValueError):
|
377 |
+
raise HTTPException(status_code=400, detail=str(e))
|
378 |
+
elif isinstance(e, (KeyError, AttributeError)):
|
379 |
+
raise HTTPException(status_code=500, detail="Internal processing error")
|
380 |
+
else:
|
381 |
+
raise HTTPException(status_code=500, detail=str(e))
|
382 |
|
383 |
@app.get("/chat/history/{conversation_id}")
|
384 |
async def get_conversation_history(conversation_id: str):
|
|
|
485 |
|
486 |
return debug_info
|
487 |
|
488 |
+
@app.post("/admin/cleanup")
|
489 |
+
async def cleanup_databases(
|
490 |
+
include_files: bool = True,
|
491 |
+
api_key: str = Depends(verify_api_key)
|
492 |
+
):
|
493 |
+
"""
|
494 |
+
Clean up all data from ChromaDB and MongoDB
|
495 |
+
|
496 |
+
Args:
|
497 |
+
include_files (bool): Whether to also delete uploaded files
|
498 |
+
"""
|
499 |
+
try:
|
500 |
+
result = await perform_cleanup(mongodb, include_files)
|
501 |
+
return result
|
502 |
+
except Exception as e:
|
503 |
+
logger.error(f"Error in cleanup operation: {str(e)}")
|
504 |
+
raise HTTPException(
|
505 |
+
status_code=500,
|
506 |
+
detail=f"Error during cleanup: {str(e)}"
|
507 |
+
)
|
508 |
+
|
509 |
@app.get("/health")
|
510 |
async def health_check():
|
511 |
"""Health check endpoint"""
|
src/models/UserContact.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel, EmailStr, validator
|
2 |
+
import re
|
3 |
+
|
4 |
+
class UserContactRequest(BaseModel):
|
5 |
+
"""Request model for user contact information"""
|
6 |
+
full_name: str
|
7 |
+
email: EmailStr
|
8 |
+
phone_number: str
|
9 |
+
|
10 |
+
@validator('phone_number')
|
11 |
+
def validate_phone(cls, v):
|
12 |
+
# Remove any non-digit characters
|
13 |
+
phone = re.sub(r'\D', '', v)
|
14 |
+
if not 8 <= len(phone) <= 15: # Standard phone number length globally
|
15 |
+
raise ValueError('Invalid phone number length')
|
16 |
+
return phone
|
17 |
+
|
18 |
+
@validator('full_name')
|
19 |
+
def validate_name(cls, v):
|
20 |
+
if not v.strip():
|
21 |
+
raise ValueError('Name cannot be empty')
|
22 |
+
return v.strip()
|
23 |
+
|
24 |
+
class UserContactResponse(BaseModel):
|
25 |
+
"""Response model for user contact endpoint"""
|
26 |
+
conversation_id: str
|
27 |
+
is_existing: bool
|
28 |
+
message: str
|
src/models/__pycache__/UserContact.cpython-312.pyc
ADDED
Binary file (1.79 kB). View file
|
|
src/utils/__pycache__/database_cleanup.cpython-312.pyc
ADDED
Binary file (7.04 kB). View file
|
|
src/utils/__pycache__/document_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
|
|
src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc
ADDED
Binary file (10.5 kB). View file
|
|
src/utils/__pycache__/llm_utils.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/llm_utils.cpython-312.pyc and b/src/utils/__pycache__/llm_utils.cpython-312.pyc differ
|
|
src/utils/database_cleanup.py
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# src/utils/database_cleanup.py
|
2 |
+
from typing import List, Dict
|
3 |
+
import chromadb
|
4 |
+
import shutil
|
5 |
+
from pathlib import Path
|
6 |
+
from src.utils.logger import logger
|
7 |
+
from config.config import settings
|
8 |
+
|
9 |
+
async def cleanup_chroma():
|
10 |
+
"""Clean up ChromaDB vector store"""
|
11 |
+
try:
|
12 |
+
# Initialize client with allow_reset=True
|
13 |
+
client = chromadb.PersistentClient(
|
14 |
+
path=settings.CHROMA_PATH,
|
15 |
+
settings=chromadb.Settings(
|
16 |
+
allow_reset=True,
|
17 |
+
is_persistent=True
|
18 |
+
)
|
19 |
+
)
|
20 |
+
|
21 |
+
# Get collection names
|
22 |
+
collection_names = client.list_collections()
|
23 |
+
|
24 |
+
# Delete each collection by name
|
25 |
+
for name in collection_names:
|
26 |
+
client.delete_collection(name)
|
27 |
+
|
28 |
+
# Reset client
|
29 |
+
client.reset()
|
30 |
+
|
31 |
+
# Remove persistence directory
|
32 |
+
path = Path(settings.CHROMA_PATH)
|
33 |
+
if path.exists():
|
34 |
+
shutil.rmtree(path)
|
35 |
+
|
36 |
+
return ["All vector store data cleared"]
|
37 |
+
except Exception as e:
|
38 |
+
raise Exception(f"ChromaDB cleanup failed: {str(e)}")
|
39 |
+
|
40 |
+
async def cleanup_mongodb(mongodb) -> List[str]:
|
41 |
+
"""
|
42 |
+
Clean up MongoDB collections
|
43 |
+
|
44 |
+
Args:
|
45 |
+
mongodb: MongoDB store instance
|
46 |
+
|
47 |
+
Returns:
|
48 |
+
List[str]: Details of cleanup operations
|
49 |
+
"""
|
50 |
+
details = []
|
51 |
+
|
52 |
+
try:
|
53 |
+
# Drop all collections
|
54 |
+
await mongodb.chat_history.delete_many({})
|
55 |
+
details.append("Cleared chat history")
|
56 |
+
|
57 |
+
await mongodb.conversations.delete_many({})
|
58 |
+
details.append("Cleared conversations")
|
59 |
+
|
60 |
+
await mongodb.documents.delete_many({})
|
61 |
+
details.append("Cleared document metadata")
|
62 |
+
|
63 |
+
await mongodb.knowledge_base.delete_many({})
|
64 |
+
details.append("Cleared knowledge base")
|
65 |
+
|
66 |
+
if hasattr(mongodb.db, 'vector_metadata'):
|
67 |
+
await mongodb.db.vector_metadata.delete_many({})
|
68 |
+
details.append("Cleared vector metadata")
|
69 |
+
|
70 |
+
return details
|
71 |
+
except Exception as e:
|
72 |
+
raise Exception(f"MongoDB cleanup failed: {str(e)}")
|
73 |
+
|
74 |
+
async def cleanup_files() -> List[str]:
|
75 |
+
"""
|
76 |
+
Clean up uploaded files
|
77 |
+
|
78 |
+
Returns:
|
79 |
+
List[str]: Details of cleanup operations
|
80 |
+
"""
|
81 |
+
details = []
|
82 |
+
uploads_dir = Path("uploads")
|
83 |
+
|
84 |
+
if uploads_dir.exists():
|
85 |
+
# Get list of files before deletion
|
86 |
+
files = list(uploads_dir.glob('*'))
|
87 |
+
|
88 |
+
# Delete all files
|
89 |
+
for file in files:
|
90 |
+
if file.is_file():
|
91 |
+
file.unlink()
|
92 |
+
details.append(f"Deleted file: {file.name}")
|
93 |
+
|
94 |
+
# Try to remove the directory itself
|
95 |
+
if not any(uploads_dir.iterdir()):
|
96 |
+
uploads_dir.rmdir()
|
97 |
+
details.append("Removed empty uploads directory")
|
98 |
+
else:
|
99 |
+
details.append("No uploads directory found")
|
100 |
+
|
101 |
+
return details
|
102 |
+
|
103 |
+
async def perform_cleanup(
|
104 |
+
mongodb,
|
105 |
+
include_files: bool = True
|
106 |
+
) -> Dict:
|
107 |
+
"""
|
108 |
+
Perform comprehensive cleanup of all databases
|
109 |
+
|
110 |
+
Args:
|
111 |
+
mongodb: MongoDB store instance
|
112 |
+
include_files (bool): Whether to also delete uploaded files
|
113 |
+
|
114 |
+
Returns:
|
115 |
+
Dict: Cleanup operation summary
|
116 |
+
"""
|
117 |
+
cleanup_summary = {
|
118 |
+
"chroma_db": {"status": "not_started", "details": []},
|
119 |
+
"mongodb": {"status": "not_started", "details": []},
|
120 |
+
"files": {"status": "not_started", "details": []}
|
121 |
+
}
|
122 |
+
|
123 |
+
try:
|
124 |
+
# Clean ChromaDB
|
125 |
+
try:
|
126 |
+
details = await cleanup_chroma()
|
127 |
+
cleanup_summary["chroma_db"] = {
|
128 |
+
"status": "success",
|
129 |
+
"details": details
|
130 |
+
}
|
131 |
+
except Exception as e:
|
132 |
+
logger.error(f"Error cleaning ChromaDB: {str(e)}")
|
133 |
+
cleanup_summary["chroma_db"] = {
|
134 |
+
"status": "error",
|
135 |
+
"details": [str(e)]
|
136 |
+
}
|
137 |
+
|
138 |
+
# Clean MongoDB
|
139 |
+
try:
|
140 |
+
details = await cleanup_mongodb(mongodb)
|
141 |
+
cleanup_summary["mongodb"] = {
|
142 |
+
"status": "success",
|
143 |
+
"details": details
|
144 |
+
}
|
145 |
+
except Exception as e:
|
146 |
+
logger.error(f"Error cleaning MongoDB: {str(e)}")
|
147 |
+
cleanup_summary["mongodb"] = {
|
148 |
+
"status": "error",
|
149 |
+
"details": [str(e)]
|
150 |
+
}
|
151 |
+
|
152 |
+
# Clean files if requested
|
153 |
+
if include_files:
|
154 |
+
try:
|
155 |
+
details = await cleanup_files()
|
156 |
+
cleanup_summary["files"] = {
|
157 |
+
"status": "success",
|
158 |
+
"details": details
|
159 |
+
}
|
160 |
+
except Exception as e:
|
161 |
+
logger.error(f"Error cleaning files: {str(e)}")
|
162 |
+
cleanup_summary["files"] = {
|
163 |
+
"status": "error",
|
164 |
+
"details": [str(e)]
|
165 |
+
}
|
166 |
+
|
167 |
+
# Determine overall status
|
168 |
+
overall_status = "success"
|
169 |
+
if any(item["status"] == "error" for item in cleanup_summary.values()):
|
170 |
+
overall_status = "partial_success"
|
171 |
+
if all(item["status"] == "error" for item in cleanup_summary.values()):
|
172 |
+
overall_status = "error"
|
173 |
+
|
174 |
+
return {
|
175 |
+
"status": overall_status,
|
176 |
+
"message": "Cleanup operation completed",
|
177 |
+
"details": cleanup_summary
|
178 |
+
}
|
179 |
+
|
180 |
+
except Exception as e:
|
181 |
+
logger.error(f"Error in cleanup operation: {str(e)}")
|
182 |
+
raise
|
src/utils/document_processor.py
CHANGED
@@ -8,13 +8,15 @@ from pathlib import Path
|
|
8 |
import hashlib
|
9 |
import magic # python-magic library for file type detection
|
10 |
from bs4 import BeautifulSoup
|
11 |
-
import requests
|
12 |
import csv
|
13 |
from datetime import datetime
|
14 |
import threading
|
15 |
from queue import Queue
|
16 |
import tiktoken
|
17 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
|
|
|
18 |
|
19 |
class DocumentProcessor:
|
20 |
def __init__(
|
@@ -29,11 +31,26 @@ class DocumentProcessor:
|
|
29 |
self.max_file_size = max_file_size
|
30 |
self.supported_formats = supported_formats or [
|
31 |
'.txt', '.pdf', '.docx', '.csv', '.json',
|
32 |
-
'.html', '.md', '.xml', '.rtf'
|
33 |
]
|
34 |
self.processing_queue = Queue()
|
35 |
self.processed_docs = {}
|
36 |
self._initialize_text_splitter()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
def _initialize_text_splitter(self):
|
39 |
"""Initialize the text splitter with custom settings"""
|
@@ -44,65 +61,10 @@ class DocumentProcessor:
|
|
44 |
separators=["\n\n", "\n", " ", ""]
|
45 |
)
|
46 |
|
47 |
-
async def process_document(
|
48 |
-
self,
|
49 |
-
file_path: Union[str, Path],
|
50 |
-
metadata: Optional[Dict] = None
|
51 |
-
) -> Dict:
|
52 |
-
"""
|
53 |
-
Process a document with metadata and content extraction
|
54 |
-
"""
|
55 |
-
file_path = Path(file_path)
|
56 |
-
|
57 |
-
# Basic validation
|
58 |
-
if not self._validate_file(file_path):
|
59 |
-
raise ValueError(f"Invalid file: {file_path}")
|
60 |
-
|
61 |
-
# Extract content based on file type
|
62 |
-
content = self._extract_content(file_path)
|
63 |
-
|
64 |
-
# Generate document metadata
|
65 |
-
doc_metadata = self._generate_metadata(file_path, content, metadata)
|
66 |
-
|
67 |
-
# Split content into chunks
|
68 |
-
chunks = self.text_splitter.split_text(content)
|
69 |
-
|
70 |
-
# Calculate embeddings chunk hashes
|
71 |
-
chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
|
72 |
-
|
73 |
-
return {
|
74 |
-
'content': content,
|
75 |
-
'chunks': chunks,
|
76 |
-
'chunk_hashes': chunk_hashes,
|
77 |
-
'metadata': doc_metadata,
|
78 |
-
'statistics': self._generate_statistics(content, chunks)
|
79 |
-
}
|
80 |
-
|
81 |
-
def _validate_file(self, file_path: Path) -> bool:
|
82 |
-
"""
|
83 |
-
Validate file type, size, and content
|
84 |
-
"""
|
85 |
-
if not file_path.exists():
|
86 |
-
raise FileNotFoundError(f"File not found: {file_path}")
|
87 |
-
|
88 |
-
if file_path.suffix.lower() not in self.supported_formats:
|
89 |
-
raise ValueError(f"Unsupported file format: {file_path.suffix}")
|
90 |
-
|
91 |
-
if file_path.stat().st_size > self.max_file_size:
|
92 |
-
raise ValueError(f"File too large: {file_path}")
|
93 |
-
|
94 |
-
# Check if file is not empty
|
95 |
-
if file_path.stat().st_size == 0:
|
96 |
-
raise ValueError(f"Empty file: {file_path}")
|
97 |
-
|
98 |
-
return True
|
99 |
-
|
100 |
def _extract_content(self, file_path: Path) -> str:
|
101 |
-
"""
|
102 |
-
Extract content from different file formats
|
103 |
-
"""
|
104 |
suffix = file_path.suffix.lower()
|
105 |
-
|
106 |
try:
|
107 |
if suffix == '.pdf':
|
108 |
return self._extract_pdf(file_path)
|
@@ -114,13 +76,28 @@ class DocumentProcessor:
|
|
114 |
return self._extract_json(file_path)
|
115 |
elif suffix == '.html':
|
116 |
return self._extract_html(file_path)
|
117 |
-
elif suffix == '.txt':
|
118 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
else:
|
120 |
raise ValueError(f"Unsupported format: {suffix}")
|
121 |
except Exception as e:
|
122 |
raise Exception(f"Error extracting content from {file_path}: {str(e)}")
|
123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
def _extract_pdf(self, file_path: Path) -> str:
|
125 |
"""Extract text from PDF with advanced features"""
|
126 |
text = ""
|
@@ -135,7 +112,6 @@ class DocumentProcessor:
|
|
135 |
if '/XObject' in page['/Resources']:
|
136 |
for obj in page['/Resources']['/XObject'].get_object():
|
137 |
if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
|
138 |
-
# Process images if needed
|
139 |
pass
|
140 |
|
141 |
return text.strip()
|
@@ -148,7 +124,6 @@ class DocumentProcessor:
|
|
148 |
for para in doc.paragraphs:
|
149 |
full_text.append(para.text)
|
150 |
|
151 |
-
# Extract tables if present
|
152 |
for table in doc.tables:
|
153 |
for row in table.rows:
|
154 |
row_text = [cell.text for cell in row.cells]
|
@@ -172,7 +147,6 @@ class DocumentProcessor:
|
|
172 |
with open(file_path) as f:
|
173 |
soup = BeautifulSoup(f, 'html.parser')
|
174 |
|
175 |
-
# Remove script and style elements
|
176 |
for script in soup(["script", "style"]):
|
177 |
script.decompose()
|
178 |
|
@@ -180,6 +154,83 @@ class DocumentProcessor:
|
|
180 |
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
181 |
return "\n\n".join(lines)
|
182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
def _generate_metadata(
|
184 |
self,
|
185 |
file_path: Path,
|
@@ -202,11 +253,64 @@ class DocumentProcessor:
|
|
202 |
'processing_timestamp': datetime.now().isoformat()
|
203 |
}
|
204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
if additional_metadata:
|
206 |
metadata.update(additional_metadata)
|
207 |
|
208 |
return metadata
|
209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
|
211 |
"""Generate document statistics"""
|
212 |
return {
|
@@ -217,18 +321,12 @@ class DocumentProcessor:
|
|
217 |
'sentences': len([s for s in content.split('.') if s.strip()]),
|
218 |
}
|
219 |
|
220 |
-
def _calculate_hash(self, text: str) -> str:
|
221 |
-
"""Calculate SHA-256 hash of text"""
|
222 |
-
return hashlib.sha256(text.encode()).hexdigest()
|
223 |
-
|
224 |
async def batch_process(
|
225 |
self,
|
226 |
file_paths: List[Union[str, Path]],
|
227 |
parallel: bool = True
|
228 |
) -> Dict[str, Dict]:
|
229 |
-
"""
|
230 |
-
Process multiple documents in parallel
|
231 |
-
"""
|
232 |
results = {}
|
233 |
|
234 |
if parallel:
|
|
|
8 |
import hashlib
|
9 |
import magic # python-magic library for file type detection
|
10 |
from bs4 import BeautifulSoup
|
|
|
11 |
import csv
|
12 |
from datetime import datetime
|
13 |
import threading
|
14 |
from queue import Queue
|
15 |
import tiktoken
|
16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
17 |
+
import logging
|
18 |
+
from bs4.element import ProcessingInstruction
|
19 |
+
from .enhanced_excel_processor import EnhancedExcelProcessor
|
20 |
|
21 |
class DocumentProcessor:
|
22 |
def __init__(
|
|
|
31 |
self.max_file_size = max_file_size
|
32 |
self.supported_formats = supported_formats or [
|
33 |
'.txt', '.pdf', '.docx', '.csv', '.json',
|
34 |
+
'.html', '.md', '.xml', '.rtf', '.xlsx', '.xls'
|
35 |
]
|
36 |
self.processing_queue = Queue()
|
37 |
self.processed_docs = {}
|
38 |
self._initialize_text_splitter()
|
39 |
+
|
40 |
+
# Initialize Excel processor
|
41 |
+
self.excel_processor = EnhancedExcelProcessor()
|
42 |
+
|
43 |
+
# Check for required packages
|
44 |
+
try:
|
45 |
+
import striprtf.striprtf
|
46 |
+
except ImportError:
|
47 |
+
logging.warning("Warning: striprtf package not found. RTF support will be limited.")
|
48 |
+
|
49 |
+
try:
|
50 |
+
from bs4 import BeautifulSoup
|
51 |
+
import lxml
|
52 |
+
except ImportError:
|
53 |
+
logging.warning("Warning: beautifulsoup4 or lxml package not found. XML support will be limited.")
|
54 |
|
55 |
def _initialize_text_splitter(self):
|
56 |
"""Initialize the text splitter with custom settings"""
|
|
|
61 |
separators=["\n\n", "\n", " ", ""]
|
62 |
)
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
def _extract_content(self, file_path: Path) -> str:
|
65 |
+
"""Extract content from different file formats"""
|
|
|
|
|
66 |
suffix = file_path.suffix.lower()
|
67 |
+
|
68 |
try:
|
69 |
if suffix == '.pdf':
|
70 |
return self._extract_pdf(file_path)
|
|
|
76 |
return self._extract_json(file_path)
|
77 |
elif suffix == '.html':
|
78 |
return self._extract_html(file_path)
|
79 |
+
elif suffix == '.txt' or suffix == '.md':
|
80 |
+
return self._extract_text(file_path)
|
81 |
+
elif suffix == '.xml':
|
82 |
+
return self._extract_xml(file_path)
|
83 |
+
elif suffix == '.rtf':
|
84 |
+
return self._extract_rtf(file_path)
|
85 |
+
elif suffix in ['.xlsx', '.xls']:
|
86 |
+
return self._extract_excel(file_path)
|
87 |
else:
|
88 |
raise ValueError(f"Unsupported format: {suffix}")
|
89 |
except Exception as e:
|
90 |
raise Exception(f"Error extracting content from {file_path}: {str(e)}")
|
91 |
|
92 |
+
def _extract_text(self, file_path: Path) -> str:
|
93 |
+
"""Extract content from text-based files"""
|
94 |
+
try:
|
95 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
96 |
+
return f.read()
|
97 |
+
except UnicodeDecodeError:
|
98 |
+
with open(file_path, 'r', encoding='latin-1') as f:
|
99 |
+
return f.read()
|
100 |
+
|
101 |
def _extract_pdf(self, file_path: Path) -> str:
|
102 |
"""Extract text from PDF with advanced features"""
|
103 |
text = ""
|
|
|
112 |
if '/XObject' in page['/Resources']:
|
113 |
for obj in page['/Resources']['/XObject'].get_object():
|
114 |
if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
|
|
|
115 |
pass
|
116 |
|
117 |
return text.strip()
|
|
|
124 |
for para in doc.paragraphs:
|
125 |
full_text.append(para.text)
|
126 |
|
|
|
127 |
for table in doc.tables:
|
128 |
for row in table.rows:
|
129 |
row_text = [cell.text for cell in row.cells]
|
|
|
147 |
with open(file_path) as f:
|
148 |
soup = BeautifulSoup(f, 'html.parser')
|
149 |
|
|
|
150 |
for script in soup(["script", "style"]):
|
151 |
script.decompose()
|
152 |
|
|
|
154 |
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
155 |
return "\n\n".join(lines)
|
156 |
|
157 |
+
def _extract_xml(self, file_path: Path) -> str:
|
158 |
+
"""Extract text from XML with structure preservation"""
|
159 |
+
try:
|
160 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
161 |
+
soup = BeautifulSoup(f, 'xml')
|
162 |
+
|
163 |
+
for pi in soup.find_all(text=lambda text: isinstance(text, ProcessingInstruction)):
|
164 |
+
pi.extract()
|
165 |
+
|
166 |
+
text = soup.get_text(separator='\n')
|
167 |
+
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
168 |
+
return "\n\n".join(lines)
|
169 |
+
except Exception as e:
|
170 |
+
raise Exception(f"Error processing XML file: {str(e)}")
|
171 |
+
|
172 |
+
def _extract_rtf(self, file_path: Path) -> str:
|
173 |
+
"""Extract text from RTF files"""
|
174 |
+
try:
|
175 |
+
import striprtf.striprtf as striprtf
|
176 |
+
|
177 |
+
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
178 |
+
rtf_text = f.read()
|
179 |
+
|
180 |
+
plain_text = striprtf.rtf_to_text(rtf_text)
|
181 |
+
lines = [line.strip() for line in plain_text.splitlines() if line.strip()]
|
182 |
+
return "\n\n".join(lines)
|
183 |
+
except ImportError:
|
184 |
+
raise ImportError("striprtf package is required for RTF support.")
|
185 |
+
except Exception as e:
|
186 |
+
raise Exception(f"Error processing RTF file: {str(e)}")
|
187 |
+
|
188 |
+
def _extract_excel(self, file_path: Path) -> str:
|
189 |
+
"""Extract content from Excel files with enhanced processing"""
|
190 |
+
try:
|
191 |
+
# Use enhanced Excel processor
|
192 |
+
processed_content = self.excel_processor.process_excel(file_path)
|
193 |
+
|
194 |
+
# If processing fails, fall back to basic processing
|
195 |
+
if not processed_content:
|
196 |
+
logging.warning(f"Enhanced Excel processing failed for {file_path}, falling back to basic processing")
|
197 |
+
return self._basic_excel_extract(file_path)
|
198 |
+
|
199 |
+
return processed_content
|
200 |
+
|
201 |
+
except Exception as e:
|
202 |
+
logging.error(f"Error in enhanced Excel processing: {str(e)}")
|
203 |
+
# Fall back to basic Excel processing
|
204 |
+
return self._basic_excel_extract(file_path)
|
205 |
+
|
206 |
+
def _basic_excel_extract(self, file_path: Path) -> str:
|
207 |
+
"""Basic Excel extraction as fallback"""
|
208 |
+
try:
|
209 |
+
excel_file = pd.ExcelFile(file_path)
|
210 |
+
sheets_data = []
|
211 |
+
|
212 |
+
for sheet_name in excel_file.sheet_names:
|
213 |
+
df = pd.read_excel(excel_file, sheet_name=sheet_name)
|
214 |
+
sheet_content = f"\nSheet: {sheet_name}\n"
|
215 |
+
sheet_content += "=" * (len(sheet_name) + 7) + "\n"
|
216 |
+
|
217 |
+
if df.empty:
|
218 |
+
sheet_content += "Empty Sheet\n"
|
219 |
+
else:
|
220 |
+
sheet_content += df.fillna('').to_string(
|
221 |
+
index=False,
|
222 |
+
max_rows=None,
|
223 |
+
max_cols=None,
|
224 |
+
line_width=120
|
225 |
+
) + "\n"
|
226 |
+
|
227 |
+
sheets_data.append(sheet_content)
|
228 |
+
|
229 |
+
return "\n\n".join(sheets_data)
|
230 |
+
|
231 |
+
except Exception as e:
|
232 |
+
raise Exception(f"Error in basic Excel processing: {str(e)}")
|
233 |
+
|
234 |
def _generate_metadata(
|
235 |
self,
|
236 |
file_path: Path,
|
|
|
253 |
'processing_timestamp': datetime.now().isoformat()
|
254 |
}
|
255 |
|
256 |
+
# Add Excel-specific metadata if applicable
|
257 |
+
if file_path.suffix.lower() in ['.xlsx', '.xls']:
|
258 |
+
try:
|
259 |
+
if hasattr(self.excel_processor, 'get_metadata'):
|
260 |
+
excel_metadata = self.excel_processor.get_metadata()
|
261 |
+
metadata.update({'excel_metadata': excel_metadata})
|
262 |
+
except Exception as e:
|
263 |
+
logging.warning(f"Could not extract Excel metadata: {str(e)}")
|
264 |
+
|
265 |
if additional_metadata:
|
266 |
metadata.update(additional_metadata)
|
267 |
|
268 |
return metadata
|
269 |
|
270 |
+
def _calculate_hash(self, text: str) -> str:
|
271 |
+
"""Calculate SHA-256 hash of text"""
|
272 |
+
return hashlib.sha256(text.encode()).hexdigest()
|
273 |
+
|
274 |
+
async def process_document(
|
275 |
+
self,
|
276 |
+
file_path: Union[str, Path],
|
277 |
+
metadata: Optional[Dict] = None
|
278 |
+
) -> Dict:
|
279 |
+
"""Process a document with metadata and content extraction"""
|
280 |
+
file_path = Path(file_path)
|
281 |
+
|
282 |
+
if not self._validate_file(file_path):
|
283 |
+
raise ValueError(f"Invalid file: {file_path}")
|
284 |
+
|
285 |
+
content = self._extract_content(file_path)
|
286 |
+
doc_metadata = self._generate_metadata(file_path, content, metadata)
|
287 |
+
chunks = self.text_splitter.split_text(content)
|
288 |
+
chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
|
289 |
+
|
290 |
+
return {
|
291 |
+
'content': content,
|
292 |
+
'chunks': chunks,
|
293 |
+
'chunk_hashes': chunk_hashes,
|
294 |
+
'metadata': doc_metadata,
|
295 |
+
'statistics': self._generate_statistics(content, chunks)
|
296 |
+
}
|
297 |
+
|
298 |
+
def _validate_file(self, file_path: Path) -> bool:
|
299 |
+
"""Validate file type, size, and content"""
|
300 |
+
if not file_path.exists():
|
301 |
+
raise FileNotFoundError(f"File not found: {file_path}")
|
302 |
+
|
303 |
+
if file_path.suffix.lower() not in self.supported_formats:
|
304 |
+
raise ValueError(f"Unsupported file format: {file_path.suffix}")
|
305 |
+
|
306 |
+
if file_path.stat().st_size > self.max_file_size:
|
307 |
+
raise ValueError(f"File too large: {file_path}")
|
308 |
+
|
309 |
+
if file_path.stat().st_size == 0:
|
310 |
+
raise ValueError(f"Empty file: {file_path}")
|
311 |
+
|
312 |
+
return True
|
313 |
+
|
314 |
def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
|
315 |
"""Generate document statistics"""
|
316 |
return {
|
|
|
321 |
'sentences': len([s for s in content.split('.') if s.strip()]),
|
322 |
}
|
323 |
|
|
|
|
|
|
|
|
|
324 |
async def batch_process(
|
325 |
self,
|
326 |
file_paths: List[Union[str, Path]],
|
327 |
parallel: bool = True
|
328 |
) -> Dict[str, Dict]:
|
329 |
+
"""Process multiple documents in parallel"""
|
|
|
|
|
330 |
results = {}
|
331 |
|
332 |
if parallel:
|
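For orientation, a minimal usage sketch of the processing flow added above. It is not part of the commit: the DocumentProcessor class name and its no-argument constructor are assumptions from context, and the file path is illustrative.

# Hypothetical usage sketch - DocumentProcessor and its constructor are assumed;
# only the process_document / _validate_file behaviour shown in the diff is relied on.
import asyncio
from src.utils.document_processor import DocumentProcessor

async def main():
    processor = DocumentProcessor()
    result = await processor.process_document("reports/sales.xlsx")   # illustrative path
    print(result['statistics'])                        # counts from _generate_statistics
    print(result['metadata'].get('excel_metadata'))    # present only for .xlsx/.xls inputs
    print(len(result['chunks']), "chunks")

asyncio.run(main())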
src/utils/enhanced_excel_processor.py
ADDED
@@ -0,0 +1,187 @@
from typing import Dict, List, Any, Optional
import pandas as pd
import numpy as np
from pathlib import Path
import json

class EnhancedExcelProcessor:
    def __init__(self):
        """Initialize the enhanced Excel processor"""
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}

    def process_excel(self, file_path: Path) -> str:
        """
        Process Excel file with enhanced multi-sheet handling

        Args:
            file_path (Path): Path to Excel file

        Returns:
            str: Structured text representation of Excel content
        """
        # Read all sheets
        excel_file = pd.ExcelFile(file_path)
        sheets_data = {}

        for sheet_name in excel_file.sheet_names:
            df = pd.read_excel(excel_file, sheet_name=sheet_name)
            sheets_data[sheet_name] = df

            # Generate sheet summary
            self.sheet_summaries[sheet_name] = self._generate_sheet_summary(df)

            # Extract sheet metadata
            self.sheet_metadata[sheet_name] = {
                'columns': list(df.columns),
                'rows': len(df),
                'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
                'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
                'categorical_columns': df.select_dtypes(include=['object']).columns.tolist()
            }

        # Detect relationships between sheets
        self.relationships = self._detect_relationships(sheets_data)

        # Generate structured text representation
        return self._generate_structured_text(sheets_data)

    def _generate_sheet_summary(self, df: pd.DataFrame) -> Dict:
        """Generate statistical summary for a sheet"""
        summary = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'column_types': {},
            'numeric_summaries': {},
            'categorical_summaries': {},
            'null_counts': df.isnull().sum().to_dict()
        }

        # Process numeric columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            summary['numeric_summaries'][col] = {
                'mean': float(df[col].mean()),
                'median': float(df[col].median()),
                'std': float(df[col].std()),
                'min': float(df[col].min()),
                'max': float(df[col].max())
            }
            summary['column_types'][col] = 'numeric'

        # Process categorical columns
        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            value_counts = df[col].value_counts()
            summary['categorical_summaries'][col] = {
                'unique_values': int(len(value_counts)),
                'top_values': value_counts.head(5).to_dict()
            }
            summary['column_types'][col] = 'categorical'

        return summary

    def _detect_relationships(self, sheets_data: Dict[str, pd.DataFrame]) -> Dict:
        """Detect potential relationships between sheets"""
        relationships = {}
        sheet_names = list(sheets_data.keys())

        for i, sheet1 in enumerate(sheet_names):
            for sheet2 in sheet_names[i+1:]:
                common_cols = set(sheets_data[sheet1].columns) & set(sheets_data[sheet2].columns)
                if common_cols:
                    relationships[f"{sheet1}__{sheet2}"] = {
                        'common_columns': list(common_cols),
                        'type': 'potential_join'
                    }

                # Check for foreign key relationships
                for col1 in sheets_data[sheet1].columns:
                    for col2 in sheets_data[sheet2].columns:
                        if (col1.lower().endswith('_id') or col2.lower().endswith('_id')):
                            unique_vals1 = set(sheets_data[sheet1][col1].dropna())
                            unique_vals2 = set(sheets_data[sheet2][col2].dropna())
                            if unique_vals1 & unique_vals2:
                                relationships[f"{sheet1}__{sheet2}__{col1}__{col2}"] = {
                                    'type': 'foreign_key',
                                    'columns': [col1, col2]
                                }

        return relationships

    def _generate_structured_text(self, sheets_data: Dict[str, pd.DataFrame]) -> str:
        """Generate structured text representation of Excel content"""
        output_parts = []

        # Overall summary
        output_parts.append(f"Excel File Overview:")
        output_parts.append(f"Total Sheets: {len(sheets_data)}")
        output_parts.append("")

        # Sheet details
        for sheet_name, df in sheets_data.items():
            output_parts.append(f"Sheet: {sheet_name}")
            output_parts.append("=" * (len(sheet_name) + 7))

            metadata = self.sheet_metadata[sheet_name]
            summary = self.sheet_summaries[sheet_name]

            # Basic info
            output_parts.append(f"Rows: {metadata['rows']}")
            output_parts.append(f"Columns: {', '.join(metadata['columns'])}")
            output_parts.append("")

            # Column summaries
            if metadata['numeric_columns']:
                output_parts.append("Numeric Columns Summary:")
                for col in metadata['numeric_columns']:
                    stats = summary['numeric_summaries'][col]
                    output_parts.append(f"  {col}:")
                    output_parts.append(f"    Range: {stats['min']} to {stats['max']}")
                    output_parts.append(f"    Average: {stats['mean']:.2f}")
                output_parts.append("")

            if metadata['categorical_columns']:
                output_parts.append("Categorical Columns Summary:")
                for col in metadata['categorical_columns']:
                    cats = summary['categorical_summaries'][col]
                    output_parts.append(f"  {col}:")
                    output_parts.append(f"    Unique Values: {cats['unique_values']}")
                    if cats['top_values']:
                        output_parts.append("    Top Values: " +
                                            ", ".join(f"{k} ({v})" for k, v in
                                                      list(cats['top_values'].items())[:3]))
                output_parts.append("")

            # Sample data
            output_parts.append("Sample Data:")
            output_parts.append(df.head(3).to_string())
            output_parts.append("\n")

        # Relationships
        if self.relationships:
            output_parts.append("Sheet Relationships:")
            for rel_key, rel_info in self.relationships.items():
                if rel_info['type'] == 'potential_join':
                    sheets = rel_key.split('__')
                    output_parts.append(f"- {sheets[0]} and {sheets[1]} share columns: " +
                                        f"{', '.join(rel_info['common_columns'])}")
                elif rel_info['type'] == 'foreign_key':
                    parts = rel_key.split('__')
                    output_parts.append(f"- Potential foreign key relationship between " +
                                        f"{parts[0]}.{parts[2]} and {parts[1]}.{parts[3]}")

        return "\n".join(output_parts)

    def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
        """Get summary for a specific sheet"""
        return self.sheet_summaries.get(sheet_name)

    def get_relationships(self) -> Dict:
        """Get detected relationships between sheets"""
        return self.relationships

    def get_metadata(self) -> Dict:
        """Get complete metadata for all sheets"""
        return self.sheet_metadata
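A short usage sketch of the new processor (not part of the commit; the workbook path is illustrative and pandas needs an Excel engine such as openpyxl installed):

# Illustrative usage of EnhancedExcelProcessor as added above.
from pathlib import Path
from src.utils.enhanced_excel_processor import EnhancedExcelProcessor

processor = EnhancedExcelProcessor()
structured_text = processor.process_excel(Path("workbook.xlsx"))  # illustrative file

print(structured_text[:500])           # overview, per-sheet summaries, sample rows
print(processor.get_metadata())        # columns, row counts and dtypes per sheet
print(processor.get_relationships())   # shared-column / foreign-key candidates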
src/utils/excel_integration
ADDED
@@ -0,0 +1,139 @@
from typing import Dict, Any
from pathlib import Path

class ExcelIntegration:
    def __init__(self, enhanced_processor):
        """
        Initialize Excel integration

        Args:
            enhanced_processor: Instance of EnhancedExcelProcessor
        """
        self.processor = enhanced_processor

    def process_for_rag(self, file_path: Path) -> Dict[str, Any]:
        """
        Process Excel file for RAG system

        Args:
            file_path (Path): Path to Excel file

        Returns:
            Dict[str, Any]: Processed content and metadata
        """
        # Process Excel file
        content = self.processor.process_excel(file_path)

        # Get all metadata
        metadata = {
            'sheet_summaries': self.processor.sheet_summaries,
            'relationships': self.processor.relationships,
            'sheet_metadata': self.processor.sheet_metadata
        }

        # Create chunks based on logical divisions
        chunks = self._create_semantic_chunks(content)

        return {
            'content': content,
            'chunks': chunks,
            'metadata': metadata
        }

    def _create_semantic_chunks(self, content: str) -> list:
        """
        Create meaningful chunks from Excel content

        Args:
            content (str): Processed Excel content

        Returns:
            list: List of content chunks
        """
        chunks = []
        current_chunk = []
        current_sheet = None

        for line in content.split('\n'):
            # Start new chunk for each sheet
            if line.startswith('Sheet: '):
                if current_chunk:
                    chunks.append('\n'.join(current_chunk))
                    current_chunk = []
                current_sheet = line
                current_chunk.append(line)

            # Start new chunk for major sections within sheet
            elif any(line.startswith(section) for section in
                     ['Numeric Columns Summary:', 'Categorical Columns Summary:',
                      'Sample Data:', 'Sheet Relationships:']):
                if current_chunk:
                    chunks.append('\n'.join(current_chunk))
                    current_chunk = []
                if current_sheet:
                    current_chunk.append(current_sheet)
                current_chunk.append(line)

            else:
                current_chunk.append(line)

        # Add final chunk
        if current_chunk:
            chunks.append('\n'.join(current_chunk))

        return chunks

    def get_sheet_context(self, sheet_name: str) -> str:
        """
        Get specific context for a sheet

        Args:
            sheet_name (str): Name of the sheet

        Returns:
            str: Contextual information about the sheet
        """
        if sheet_name not in self.processor.sheet_metadata:
            return ""

        metadata = self.processor.sheet_metadata[sheet_name]
        summary = self.processor.sheet_summaries[sheet_name]

        context_parts = [
            f"Sheet: {sheet_name}",
            f"Total Rows: {metadata['rows']}",
            f"Columns: {', '.join(metadata['columns'])}",
        ]

        # Add numeric column summaries
        if metadata['numeric_columns']:
            context_parts.append("\nNumeric Columns:")
            for col in metadata['numeric_columns']:
                stats = summary['numeric_summaries'][col]
                context_parts.append(f"- {col}: Range {stats['min']} to {stats['max']}, "
                                     f"Average {stats['mean']:.2f}")

        # Add categorical column summaries
        if metadata['categorical_columns']:
            context_parts.append("\nCategorical Columns:")
            for col in metadata['categorical_columns']:
                cats = summary['categorical_summaries'][col]
                context_parts.append(f"- {col}: {cats['unique_values']} unique values")

        return "\n".join(context_parts)

    def get_relationship_context(self) -> str:
        """
        Get context about relationships between sheets

        Returns:
            str: Information about sheet relationships
        """
        if not self.processor.relationships:
            return "No relationships detected between sheets."

        context_parts = ["Sheet Relationships:"]

        for rel_key, rel_info in self.processor.relationships.items():
            if rel_info['type'] == 'potential_join':
                sheets = rel_
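A usage sketch for the integration layer (not part of the commit; the rendered diff cuts off inside get_relationship_context, and the file is committed without a .py extension, so the import below assumes it is renamed to excel_integration.py):

# Illustrative usage of ExcelIntegration, assuming the module is importable.
from pathlib import Path
from src.utils.enhanced_excel_processor import EnhancedExcelProcessor
from src.utils.excel_integration import ExcelIntegration  # assumes a .py extension

integration = ExcelIntegration(EnhancedExcelProcessor())
payload = integration.process_for_rag(Path("workbook.xlsx"))  # illustrative file

for chunk in payload['chunks'][:3]:
    print(chunk.splitlines()[0])   # each chunk starts with its sheet/section header
print(integration.get_sheet_context(next(iter(payload['metadata']['sheet_metadata']))))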
src/utils/llm_utils.py
CHANGED
@@ -9,6 +9,7 @@ from src.llms.falcon_llm import FalconLanguageModel
 from src.llms.llama_llm import LlamaLanguageModel
 from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
 from src.vectorstores.chroma_vectorstore import ChromaVectorStore
+from src.vectorstores.optimized_vectorstore import get_optimized_vector_store
 from src.utils.logger import logger
 from config.config import settings
 
@@ -39,21 +40,22 @@ def get_llm_instance(provider: str):
 
 async def get_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
     """
+    Get vector store and embedding model instances
+    Uses optimized implementation while maintaining backward compatibility
 
     Returns:
-        Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
-
-    Raises:
-        HTTPException: If vector store initialization fails
+        Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
+            Vector store and embedding model instances
     """
     try:
+        return await get_optimized_vector_store()
+    except Exception as e:
+        logger.error(f"Error getting optimized vector store: {str(e)}")
+        # Fallback to original implementation if optimization fails
+        logger.warning("Falling back to standard vector store implementation")
         embedding = HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)
         vector_store = ChromaVectorStore(
             embedding_function=embedding.embed_documents,
             persist_directory=settings.CHROMA_PATH
         )
-        return vector_store, embedding
-    except Exception as e:
-        logger.error(f"Error initializing vector store: {str(e)}")
-        raise HTTPException(status_code=500, detail="Failed to initialize vector store")
+        return vector_store, embedding
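Callers keep the same entry point; a minimal sketch of how the changed helper is now used (illustrative, not part of the commit):

# Illustrative caller of the updated get_vector_store helper.
from src.utils.llm_utils import get_vector_store

async def ensure_vector_store():
    # Tries the optimized singleton first; falls back to a plain
    # ChromaVectorStore construction if the optimized path fails.
    vector_store, embedding = await get_vector_store()
    return vector_store, embedding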
src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc
ADDED
Binary file (6.85 kB).
src/vectorstores/optimized_vectorstore.py
ADDED
@@ -0,0 +1,137 @@
# src/vectorstores/optimized_vectorstore.py
import asyncio
from typing import Tuple, Optional, List, Dict, Any, Callable
import concurrent.futures
from functools import lru_cache

from .base_vectorstore import BaseVectorStore
from .chroma_vectorstore import ChromaVectorStore
from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
from src.utils.logger import logger
from config.config import settings

class OptimizedVectorStore(ChromaVectorStore):
    """
    Optimized vector store that maintains ChromaVectorStore compatibility
    while adding caching and async initialization
    """
    _instance: Optional['OptimizedVectorStore'] = None
    _lock = asyncio.Lock()
    _initialized = False
    _embedding_model: Optional[HuggingFaceEmbedding] = None
    _executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(
        self,
        embedding_function: Optional[Callable] = None,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize the optimized vector store
        Note: The actual initialization is deferred until needed
        """
        if not self._initialized:
            self._persist_directory = persist_directory
            self._collection_name = collection_name
            self._client_settings = client_settings
            self._embedding_function = embedding_function
            # Don't call super().__init__() here - we'll do it in _initialize()

    @classmethod
    async def create(
        cls,
        persist_directory: str = settings.CHROMA_PATH,
        collection_name: str = "documents",
        client_settings: Optional[Dict[str, Any]] = None
    ) -> Tuple['OptimizedVectorStore', HuggingFaceEmbedding]:
        """
        Asynchronously create or get instance

        Returns:
            Tuple[OptimizedVectorStore, HuggingFaceEmbedding]:
                The vector store instance and embedding model
        """
        async with cls._lock:
            if not cls._instance or not cls._initialized:
                instance = cls(
                    persist_directory=persist_directory,
                    collection_name=collection_name,
                    client_settings=client_settings
                )
                await instance._initialize()
                cls._instance = instance
            return cls._instance, cls._instance._embedding_model

    async def _initialize(self) -> None:
        """Initialize the vector store and embedding model"""
        if self._initialized:
            return

        try:
            # Load embedding model in background thread
            self._embedding_model = await self._load_embedding_model()

            # Initialize ChromaVectorStore with the loaded model
            super().__init__(
                embedding_function=self._embedding_model.embed_documents,
                persist_directory=self._persist_directory,
                collection_name=self._collection_name,
                client_settings=self._client_settings
            )

            self._initialized = True

        except Exception as e:
            logger.error(f"Error initializing vector store: {str(e)}")
            raise

    async def _load_embedding_model(self) -> HuggingFaceEmbedding:
        """Load embedding model in background thread"""
        try:
            loop = asyncio.get_event_loop()
            return await loop.run_in_executor(
                self._executor,
                self._create_embedding_model
            )
        except Exception as e:
            logger.error(f"Error loading embedding model: {str(e)}")
            raise

    @staticmethod
    @lru_cache(maxsize=1)
    def _create_embedding_model() -> HuggingFaceEmbedding:
        """Create and cache embedding model"""
        return HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)

    def __getattribute__(self, name):
        """
        Ensure initialization before accessing any ChromaVectorStore methods
        """
        # Get the attribute from the class
        attr = super().__getattribute__(name)

        # If it's a method from ChromaVectorStore, ensure initialization
        if callable(attr) and name in ChromaVectorStore.__dict__:
            if not self._initialized:
                raise RuntimeError(
                    "Vector store not initialized. Please use 'await OptimizedVectorStore.create()'"
                )
        return attr

# Factory function for getting optimized vector store
async def get_optimized_vector_store() -> Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
    """
    Get or create an optimized vector store instance

    Returns:
        Tuple[ChromaVectorStore, HuggingFaceEmbedding]:
            The vector store and embedding model instances
    """
    return await OptimizedVectorStore.create()
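A usage sketch for the new factory (not part of the commit; store method names are whatever ChromaVectorStore already exposes and are not shown here):

# Illustrative usage of get_optimized_vector_store.
import asyncio
from src.vectorstores.optimized_vectorstore import get_optimized_vector_store

async def main():
    store, embedding = await get_optimized_vector_store()   # first call loads the model in a worker thread
    store_again, _ = await get_optimized_vector_store()     # later calls reuse the singleton
    assert store is store_again                             # same initialized instance is returned
    print(type(store).__name__, type(embedding).__name__)

asyncio.run(main())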