Spaces:
Running
Running
Commit
·
aee2bfd
1
Parent(s):
b953016
Implementation for Google drive is done. Now it is working fine, except need a new function that can delete the chunks if a document is removed from google drive.
Browse files- client_secret_1048685176388-r728705k0ef26t09rblffue3cqeaaco1.apps.googleusercontent.com.json +1 -0
- config/__pycache__/config.cpython-312.pyc +0 -0
- config/config.py +29 -0
- service_account.json +13 -0
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/agents/__pycache__/excel_aware_rag.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/system_instructions_rag.cpython-312.pyc +0 -0
- src/agents/system_instructions_rag.py +216 -0
- src/db/__pycache__/mongodb_store.cpython-312.pyc +0 -0
- src/main.py +109 -8
- src/models/__pycache__/UserContact.cpython-312.pyc +0 -0
- src/models/__pycache__/chat.cpython-312.pyc +0 -0
- src/models/__pycache__/rag.cpython-312.pyc +0 -0
- src/utils/__pycache__/conversation_manager.cpython-312.pyc +0 -0
- src/utils/__pycache__/database_cleanup.cpython-312.pyc +0 -0
- src/utils/__pycache__/document_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/drive_document_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/google_drive_service.cpython-312.pyc +0 -0
- src/utils/__pycache__/llm_utils.cpython-312.pyc +0 -0
- src/utils/drive_document_processor.py +315 -0
- src/utils/google_drive_service.py +85 -0
- src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc +0 -0
- testfile.txt +0 -1
client_secret_1048685176388-r728705k0ef26t09rblffue3cqeaaco1.apps.googleusercontent.com.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"web":{"client_id":"1048685176388-r728705k0ef26t09rblffue3cqeaaco1.apps.googleusercontent.com","project_id":"demochatbotpropx","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"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","redirect_uris":["http://127.0.0.1:8000/google/oauth2callback"],"javascript_origins":["http://localhost:8000"]}}
|
config/__pycache__/config.cpython-312.pyc
CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
|
|
config/config.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
# config/config.py
|
2 |
import os
|
3 |
from dotenv import load_dotenv
|
|
|
|
|
4 |
|
5 |
# Load environment variables
|
6 |
load_dotenv()
|
@@ -31,7 +33,34 @@ class Settings:
|
|
31 |
# Feedback Configuration
|
32 |
MAX_RATING = int(os.getenv('MAX_RATING', '5'))
|
33 |
|
|
|
|
|
|
|
34 |
# Application Configuration
|
35 |
DEBUG = os.getenv('DEBUG', 'False') == 'True'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
settings = Settings()
|
|
|
1 |
# config/config.py
|
2 |
import os
|
3 |
from dotenv import load_dotenv
|
4 |
+
from google.oauth2.credentials import Credentials
|
5 |
+
from google_auth_oauthlib.flow import Flow
|
6 |
|
7 |
# Load environment variables
|
8 |
load_dotenv()
|
|
|
33 |
# Feedback Configuration
|
34 |
MAX_RATING = int(os.getenv('MAX_RATING', '5'))
|
35 |
|
36 |
+
# Temporary directory for downloaded files
|
37 |
+
TEMP_DOWNLOAD_DIR = os.getenv('TEMP_DOWNLOAD_DIR', './temp_downloads')
|
38 |
+
|
39 |
# Application Configuration
|
40 |
DEBUG = os.getenv('DEBUG', 'False') == 'True'
|
41 |
+
|
42 |
+
# Google Drive Configuration
|
43 |
+
GOOGLE_DRIVE_FOLDER_ID=os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
|
44 |
+
GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv('GOOGLE_SERVICE_ACCOUNT_PATH', 'service_account.json')
|
45 |
+
|
46 |
+
# GOOGLE_DRIVE_FOLDER_ID = os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
|
47 |
+
# GOOGLE_OAUTH_CLIENT_ID = os.getenv('GOOGLE_OAUTH_CLIENT_ID', '')
|
48 |
+
# GOOGLE_OAUTH_CLIENT_SECRET = os.getenv('GOOGLE_OAUTH_CLIENT_SECRET', '')
|
49 |
+
# GOOGLE_OAUTH_REDIRECT_URI = os.getenv('GOOGLE_OAUTH_REDIRECT_URI', 'http://127.0.0.1:8000/google/oauth2callback')
|
50 |
+
|
51 |
+
# @property
|
52 |
+
# def google_oauth_flow(self):
|
53 |
+
# flow = Flow.from_client_config({
|
54 |
+
# "web": {
|
55 |
+
# "client_id": self.GOOGLE_OAUTH_CLIENT_ID,
|
56 |
+
# "client_secret": self.GOOGLE_OAUTH_CLIENT_SECRET,
|
57 |
+
# "auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
58 |
+
# "token_uri": "https://oauth2.googleapis.com/token",
|
59 |
+
# "redirect_uris": [self.GOOGLE_OAUTH_REDIRECT_URI],
|
60 |
+
# "javascript_origins": ["http://localhost:8000", "http://127.0.0.1:8000"]
|
61 |
+
# }
|
62 |
+
# }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
|
63 |
+
# flow.redirect_uri = self.GOOGLE_OAUTH_REDIRECT_URI
|
64 |
+
# return flow
|
65 |
|
66 |
settings = Settings()
|
service_account.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"type": "service_account",
|
3 |
+
"project_id": "demochatbotpropx",
|
4 |
+
"private_key_id": "7afaa51c4fc75d0d25668e84032d12622408356e",
|
5 |
+
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQC9Or6H3z2BSL24\nfc41WTxUmmhzXUggT8cM0VB/5PTMat8j/vmVhia7mwpMnNEU5f4xcwuF0Lkhjsj2\nCryliph3dPxPUaVCy7aFBatl6kjvf9lBh3bTtIXtONjBN9w2UEqOc85zTpBJxqrT\nNb50VUnZhEDeWAcvB5L3l5+EZxlKUN6HDCdadhLWK4MMuk9XCwQvfqlWR/kT8TJd\nVa408Q4sEoE5F0w5g2epPGBCbCN/FonFq1Mp1Q+kUDmJvdVma4JVBVHSDrqszKum\n/9UPREIEIWar5FPkIVFDwhpKTIWKuwwfdSfbuchVihv1POYksx8NebzxJigP4TWG\nE+QuPpodAgMBAAECggEAC3DMfMkakVrCA0nL5Veehg9XbR4nLBjwRzScG6Q0+4Tv\nz8Y5j18IfKnStUlH4aEUI/Sfx9JHClUOwZCdnjT81qZ8HxmOSc8PAaGs0eEYnsfj\nU6RqiUGUiIA6DqVSpMM1XSgTdwI5em4B/WB2KHEEB+ju+RWbMMgFWTKefAu1sKBY\n1GvNw0QhiizqQN0q2CNSSPqDkPkb24YNTuX0i3XAt3lel3JFbv/SsRgX8NHbO0aU\nPlDleqQF6wWVu5wVmwbw8digXDcsPw/8gb/EqEfgaKLz0/bKd74nyKA/pGZnD+KN\n1pusV+iYymfaejFmSdRPMVq0NfWqRLRw8i7T1xnYmwKBgQD4+cPaeVt1kCqXtCEm\nhRofAa9rPnhkHb44pUffalYgdLi9vT31H1bHh9D8leN4Df1sLQrTttxa7QaVIPfX\nE64yAShyu603nmZ/GUAvtUy54lB76SOjHMWNhbCyEzDcOYSkjH3SedlSYX1xqYNl\nN0GUFeaBDS1PUyyYTSN2rZRYpwKBgQDCkXXgYirMaw2jksij2Ru8ZdT/Dg54YhHQ\n6+/xV+K+MCnt6vw3qloatJzOdXEnulndjl71d8WVYlpBkHzHEaLy3XgA6FGtwq6J\nvKS1w/FZcDi2ra0RFtAi844/HUftbB1ZBSe93nxHNw10XRzFHtFc6OPFcFfOmhgJ\nnYLbH2xLmwKBgBtD9u/RBHQOcqukXVEDmIW2wIglEjgcjb1UVFeiJIZvYd/dfpB+\nexlkxT00CPIXzh3vnNTsnJsUg/kG4D1ceWIegFh4NxL1NNJMaJwQ5bMhlqDLOkzd\nlMDX2C7YLSyg2+bNP+Yx09vSs1MkNjB6aaMW9uRBFiouuJ6BLBYOEkXXAoGBAJp3\n3Tuc9BmCTDu2xu+959U0i1tKj5ZnVXmmNsJGYc9YcZFfY4nWBt740RzgBEvkGIBb\nDWyYABdPFBTFXyq0B8gEp8cgqefnjaXwTFu6ChxVidEOJT5R/EAjWKUm2/nUQaBx\nBVIqFkR7ooTlf3fHtbOreVlAjZWKpNbNZBwO4G1NAoGAU3HX5fyC+0OWMi22cIha\nyUivxH8ti6JmfNJllRr5V3bHHqKrLCPC8tgAZouvutendjm3beNhTeQGoY2QsNhI\nF5NC8euwpYMLhhwVvTB0G6sBxplZZ3FUNMUtpFQ0qvb6VYNQOa2+qBagemhEPhxs\nBfQH0FnqRjBjGoaX7nw6FnU=\n-----END PRIVATE KEY-----\n",
|
6 |
+
"client_email": "[email protected]",
|
7 |
+
"client_id": "108855684700262853537",
|
8 |
+
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
9 |
+
"token_uri": "https://oauth2.googleapis.com/token",
|
10 |
+
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
11 |
+
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/demochatbotpropx%40demochatbotpropx.iam.gserviceaccount.com",
|
12 |
+
"universe_domain": "googleapis.com"
|
13 |
+
}
|
src/__pycache__/main.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
|
|
src/agents/__pycache__/excel_aware_rag.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/excel_aware_rag.cpython-312.pyc and b/src/agents/__pycache__/excel_aware_rag.cpython-312.pyc differ
|
|
src/agents/__pycache__/rag_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
|
|
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
ADDED
Binary file (8.58 kB). View file
|
|
src/agents/system_instructions_rag.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# src/agents/system_instructions_rag.py
|
2 |
+
from typing import List, Dict, Optional
|
3 |
+
from src.agents.rag_agent import RAGResponse
|
4 |
+
from src.utils.logger import logger
|
5 |
+
from src.agents.rag_agent import RAGAgent
|
6 |
+
|
7 |
+
class SystemInstructionsRAGAgent(RAGAgent):
|
8 |
+
"""RAG Agent with enhanced system instructions for specific use cases"""
|
9 |
+
|
10 |
+
async def generate_response(
|
11 |
+
self,
|
12 |
+
query: str,
|
13 |
+
conversation_id: Optional[str] = None,
|
14 |
+
temperature: float = 0.7,
|
15 |
+
max_tokens: Optional[int] = None,
|
16 |
+
context_docs: Optional[List[str]] = None
|
17 |
+
) -> RAGResponse:
|
18 |
+
"""
|
19 |
+
Generate response with specific handling for introduction and no-context cases
|
20 |
+
"""
|
21 |
+
try:
|
22 |
+
# First, check if this is an introduction/welcome message query
|
23 |
+
is_introduction = (
|
24 |
+
"wants support" in query and
|
25 |
+
"This is Introduction" in query and
|
26 |
+
("A new user with name:" in query or "An old user with name:" in query)
|
27 |
+
)
|
28 |
+
|
29 |
+
if is_introduction:
|
30 |
+
# Handle introduction message - no context needed
|
31 |
+
welcome_message = self._handle_contact_query(query)
|
32 |
+
return RAGResponse(
|
33 |
+
response=welcome_message,
|
34 |
+
context_docs=[],
|
35 |
+
sources=[],
|
36 |
+
scores=None
|
37 |
+
)
|
38 |
+
|
39 |
+
# For all other queries, proceed with context-based response
|
40 |
+
if not context_docs:
|
41 |
+
context_docs, sources, scores = await self.retrieve_context(
|
42 |
+
query,
|
43 |
+
conversation_history=[]
|
44 |
+
)
|
45 |
+
|
46 |
+
# Check if we have relevant context
|
47 |
+
has_relevant_context = self._check_context_relevance(query, context_docs or [])
|
48 |
+
|
49 |
+
# If no relevant context found, return the standard message
|
50 |
+
if not has_relevant_context:
|
51 |
+
return RAGResponse(
|
52 |
+
response="Information about this is not available, do you want to inquire about something else?",
|
53 |
+
context_docs=[],
|
54 |
+
sources=[],
|
55 |
+
scores=None
|
56 |
+
)
|
57 |
+
|
58 |
+
# Generate response using context
|
59 |
+
prompt = self._create_response_prompt(query, context_docs)
|
60 |
+
response_text = self.llm.generate(
|
61 |
+
prompt,
|
62 |
+
temperature=temperature,
|
63 |
+
max_tokens=max_tokens
|
64 |
+
)
|
65 |
+
|
66 |
+
# Check if the generated response indicates no information
|
67 |
+
cleaned_response = self._clean_response(response_text)
|
68 |
+
if self._is_no_info_response(cleaned_response):
|
69 |
+
return RAGResponse(
|
70 |
+
response="Information about this is not available, do you want to inquire about something else?",
|
71 |
+
context_docs=[],
|
72 |
+
sources=[],
|
73 |
+
scores=None
|
74 |
+
)
|
75 |
+
|
76 |
+
return RAGResponse(
|
77 |
+
response=cleaned_response,
|
78 |
+
context_docs=context_docs,
|
79 |
+
sources=sources,
|
80 |
+
scores=scores
|
81 |
+
)
|
82 |
+
|
83 |
+
except Exception as e:
|
84 |
+
logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
|
85 |
+
raise
|
86 |
+
|
87 |
+
def _is_no_info_response(self, response: str) -> bool:
|
88 |
+
"""Check if the response indicates no information available"""
|
89 |
+
no_info_indicators = [
|
90 |
+
"i do not have",
|
91 |
+
"i don't have",
|
92 |
+
"no information",
|
93 |
+
"not available",
|
94 |
+
"could not find",
|
95 |
+
"couldn't find",
|
96 |
+
"cannot find"
|
97 |
+
]
|
98 |
+
response_lower = response.lower()
|
99 |
+
return any(indicator in response_lower for indicator in no_info_indicators)
|
100 |
+
|
101 |
+
def _check_context_relevance(self, query: str, context_docs: List[str]) -> bool:
|
102 |
+
"""Check if context contains information relevant to the query"""
|
103 |
+
if not context_docs:
|
104 |
+
return False
|
105 |
+
|
106 |
+
# Extract key terms from query
|
107 |
+
query_words = query.lower().split()
|
108 |
+
stop_words = {'share', 'me', 'a', 'about', 'information', 'what', 'is', 'are', 'the', 'in', 'how', 'why', 'when', 'where'}
|
109 |
+
query_terms = {word for word in query_words if word not in stop_words}
|
110 |
+
|
111 |
+
# Check each context document for relevance
|
112 |
+
for doc in context_docs:
|
113 |
+
if not doc:
|
114 |
+
continue
|
115 |
+
doc_lower = doc.lower()
|
116 |
+
if any(term in doc_lower for term in query_terms):
|
117 |
+
# Found relevant content
|
118 |
+
return True
|
119 |
+
return False
|
120 |
+
|
121 |
+
def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
|
122 |
+
"""Create prompt for generating response from context"""
|
123 |
+
formatted_context = '\n\n'.join(
|
124 |
+
f"Context {i+1}:\n{doc.strip()}"
|
125 |
+
for i, doc in enumerate(context_docs)
|
126 |
+
if doc and doc.strip()
|
127 |
+
)
|
128 |
+
|
129 |
+
return f"""
|
130 |
+
Use ONLY the following context to provide information about: {query}
|
131 |
+
|
132 |
+
{formatted_context}
|
133 |
+
|
134 |
+
Instructions:
|
135 |
+
1. Use ONLY information present in the context above
|
136 |
+
2. If the information is found in the context, provide a direct and concise response
|
137 |
+
3. Do not make assumptions or add information not present in the context
|
138 |
+
4. Ensure the response is clear and complete based on available information
|
139 |
+
5. If you cannot find relevant information about the specific query in the context,
|
140 |
+
respond exactly with: "Information about this is not available, do you want to inquire about something else?"
|
141 |
+
|
142 |
+
Query: {query}
|
143 |
+
Response:"""
|
144 |
+
|
145 |
+
def _handle_contact_query(self, query: str) -> str:
|
146 |
+
"""Handle queries from /user/contact endpoint"""
|
147 |
+
try:
|
148 |
+
name_start = query.find('name: "') + 7
|
149 |
+
name_end = query.find('"', name_start)
|
150 |
+
name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
|
151 |
+
|
152 |
+
is_returning = (
|
153 |
+
"An old user with name:" in query and
|
154 |
+
"wants support again" in query
|
155 |
+
)
|
156 |
+
|
157 |
+
if is_returning:
|
158 |
+
return f"Welcome back {name}, How can I help you?"
|
159 |
+
return f"Welcome {name}, How can I help you?"
|
160 |
+
|
161 |
+
except Exception as e:
|
162 |
+
logger.error(f"Error handling contact query: {str(e)}")
|
163 |
+
return "Welcome, How can I help you?"
|
164 |
+
|
165 |
+
def _clean_response(self, response: str) -> str:
|
166 |
+
"""Clean response by removing unwanted phrases"""
|
167 |
+
if not response:
|
168 |
+
return response
|
169 |
+
|
170 |
+
phrases_to_remove = [
|
171 |
+
"Based on the context provided,",
|
172 |
+
"According to the documents,",
|
173 |
+
"From the information available,",
|
174 |
+
"I can tell you that",
|
175 |
+
"Let me help you with that",
|
176 |
+
"I understand you're asking about",
|
177 |
+
"To answer your question,",
|
178 |
+
"The documents indicate that",
|
179 |
+
"Based on the available information,",
|
180 |
+
"As per the provided context,",
|
181 |
+
"I would be happy to help you with that",
|
182 |
+
"Let me provide you with information about",
|
183 |
+
"Here's what I found:",
|
184 |
+
"Here's the information you requested:",
|
185 |
+
"According to the provided information,",
|
186 |
+
"Based on the documents,",
|
187 |
+
"The information suggests that",
|
188 |
+
"From what I can see,",
|
189 |
+
"Let me explain",
|
190 |
+
"To clarify,",
|
191 |
+
"It appears that",
|
192 |
+
"I can see that",
|
193 |
+
"Sure,",
|
194 |
+
"Well,",
|
195 |
+
"Based on the given context,",
|
196 |
+
"The available information shows that",
|
197 |
+
"From the context provided,",
|
198 |
+
"The documentation mentions that",
|
199 |
+
"According to the context,",
|
200 |
+
"As shown in the context,",
|
201 |
+
"I apologize,"
|
202 |
+
]
|
203 |
+
|
204 |
+
cleaned_response = response
|
205 |
+
for phrase in phrases_to_remove:
|
206 |
+
cleaned_response = cleaned_response.replace(phrase, "").strip()
|
207 |
+
|
208 |
+
cleaned_response = " ".join(cleaned_response.split())
|
209 |
+
|
210 |
+
if not cleaned_response:
|
211 |
+
return response
|
212 |
+
|
213 |
+
if cleaned_response[0].islower():
|
214 |
+
cleaned_response = cleaned_response[0].upper() + cleaned_response[1:]
|
215 |
+
|
216 |
+
return cleaned_response
|
src/db/__pycache__/mongodb_store.cpython-312.pyc
CHANGED
Binary files a/src/db/__pycache__/mongodb_store.cpython-312.pyc and b/src/db/__pycache__/mongodb_store.cpython-312.pyc differ
|
|
src/main.py
CHANGED
@@ -8,12 +8,23 @@ import uuid
|
|
8 |
from datetime import datetime
|
9 |
from pathlib import Path
|
10 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# Import custom modules1
|
13 |
-
from src.agents.rag_agent import RAGAgent
|
|
|
14 |
from src.models.document import AllDocumentsResponse, StoredDocument
|
15 |
from src.models.UserContact import UserContactRequest
|
16 |
from src.utils.document_processor import DocumentProcessor
|
|
|
17 |
from src.utils.conversation_summarizer import ConversationSummarizer
|
18 |
from src.utils.logger import logger
|
19 |
from src.utils.llm_utils import get_llm_instance, get_vector_store
|
@@ -43,6 +54,8 @@ app.add_middleware(
|
|
43 |
allow_headers=["*"], # Allows all headers
|
44 |
)
|
45 |
|
|
|
|
|
46 |
# Initialize MongoDB
|
47 |
mongodb = MongoDBStore(settings.MONGODB_URI)
|
48 |
|
@@ -70,6 +83,43 @@ async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
|
|
70 |
)
|
71 |
return api_key
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
@app.get("/documents")
|
74 |
async def get_all_documents():
|
75 |
"""Get all documents from MongoDB"""
|
@@ -147,7 +197,8 @@ async def upload_documents(
|
|
147 |
logger.error(f"Error in document upload: {str(e)}")
|
148 |
raise HTTPException(status_code=500, detail=str(e))
|
149 |
|
150 |
-
|
|
|
151 |
async def get_document_chunks(document_id: str):
|
152 |
"""Get all chunks for a specific document"""
|
153 |
try:
|
@@ -207,8 +258,31 @@ async def delete_document(document_id: str):
|
|
207 |
logger.error(f"Error in delete_document endpoint: {str(e)}")
|
208 |
raise HTTPException(status_code=500, detail=str(e))
|
209 |
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
@app.post("/user/contact", response_model=ChatResponse)
|
213 |
async def create_user_contact(
|
214 |
request: UserContactRequest,
|
@@ -224,7 +298,7 @@ async def create_user_contact(
|
|
224 |
|
225 |
if existing_conversation_id:
|
226 |
chat_request = ChatRequest(
|
227 |
-
query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. Create a welcome back message for him and ask how i can help you today?',
|
228 |
llm_provider="openai",
|
229 |
max_context_docs=3,
|
230 |
temperature=1.0,
|
@@ -242,7 +316,7 @@ async def create_user_contact(
|
|
242 |
)
|
243 |
|
244 |
chat_request = ChatRequest(
|
245 |
-
query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. Create a welcome message for him and ask how i can help you today?',
|
246 |
llm_provider="openai",
|
247 |
max_context_docs=3,
|
248 |
temperature=1.0,
|
@@ -272,13 +346,40 @@ async def chat_endpoint(
|
|
272 |
llm = get_llm_instance(request.llm_provider)
|
273 |
|
274 |
# Initialize RAG agent
|
275 |
-
rag_agent = RAGAgent(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
llm=llm,
|
277 |
embedding=embedding_model,
|
278 |
vector_store=vector_store,
|
279 |
mongodb=mongodb
|
280 |
)
|
281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
282 |
# Use provided conversation ID or create new one
|
283 |
conversation_id = request.conversation_id or str(uuid.uuid4())
|
284 |
|
@@ -287,7 +388,7 @@ async def chat_endpoint(
|
|
287 |
|
288 |
# Add specific instructions for certain types of queries
|
289 |
#if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
|
290 |
-
query += ". The response should be short and to the point. Make sure to not add any irrelevant information. Keep the introduction concise and friendly."
|
291 |
|
292 |
# Generate response
|
293 |
logger.info(f"Generating response: {str(datetime.now())}")
|
|
|
8 |
from datetime import datetime
|
9 |
from pathlib import Path
|
10 |
import os
|
11 |
+
import asyncio
|
12 |
+
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
|
13 |
+
#os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
|
14 |
+
|
15 |
+
|
16 |
+
from fastapi.responses import RedirectResponse
|
17 |
+
from google.oauth2.credentials import Credentials
|
18 |
+
from google_auth_oauthlib.flow import Flow
|
19 |
+
from src.utils.google_drive_service import GoogleDriveService
|
20 |
|
21 |
# Import custom modules1
|
22 |
+
#from src.agents.rag_agent import RAGAgent
|
23 |
+
from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
|
24 |
from src.models.document import AllDocumentsResponse, StoredDocument
|
25 |
from src.models.UserContact import UserContactRequest
|
26 |
from src.utils.document_processor import DocumentProcessor
|
27 |
+
from src.utils.drive_document_processor import DriveDocumentProcessor
|
28 |
from src.utils.conversation_summarizer import ConversationSummarizer
|
29 |
from src.utils.logger import logger
|
30 |
from src.utils.llm_utils import get_llm_instance, get_vector_store
|
|
|
54 |
allow_headers=["*"], # Allows all headers
|
55 |
)
|
56 |
|
57 |
+
#google_drive_service = GoogleDriveService()
|
58 |
+
|
59 |
# Initialize MongoDB
|
60 |
mongodb = MongoDBStore(settings.MONGODB_URI)
|
61 |
|
|
|
83 |
)
|
84 |
return api_key
|
85 |
|
86 |
+
# @app.get("/google/auth")
|
87 |
+
# async def google_auth():
|
88 |
+
# authorization_url, _ = settings.google_oauth_flow.authorization_url(
|
89 |
+
# access_type='offline',
|
90 |
+
# prompt='consent',
|
91 |
+
# include_granted_scopes='true'
|
92 |
+
# )
|
93 |
+
# return RedirectResponse(authorization_url)
|
94 |
+
|
95 |
+
# @app.get("/google/oauth2callback")
|
96 |
+
# async def google_auth_callback(code: str):
|
97 |
+
# flow = Flow.from_client_config({
|
98 |
+
# "web": {
|
99 |
+
# "client_id": settings.GOOGLE_OAUTH_CLIENT_ID,
|
100 |
+
# "client_secret": settings.GOOGLE_OAUTH_CLIENT_SECRET,
|
101 |
+
# "auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
102 |
+
# "token_uri": "https://oauth2.googleapis.com/token",
|
103 |
+
# "redirect_uris": [settings.GOOGLE_OAUTH_REDIRECT_URI]
|
104 |
+
# }
|
105 |
+
# }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
|
106 |
+
|
107 |
+
# flow.redirect_uri = settings.GOOGLE_OAUTH_REDIRECT_URI
|
108 |
+
|
109 |
+
# # Add access type and prompt parameters for refresh token
|
110 |
+
# flow.fetch_token(
|
111 |
+
# code=code,
|
112 |
+
# access_type='offline',
|
113 |
+
# prompt='consent'
|
114 |
+
# )
|
115 |
+
# credentials = flow.credentials
|
116 |
+
|
117 |
+
# return {
|
118 |
+
# "message": "Authentication successful",
|
119 |
+
# "credentials": credentials.to_json()
|
120 |
+
# }
|
121 |
+
|
122 |
+
|
123 |
@app.get("/documents")
|
124 |
async def get_all_documents():
|
125 |
"""Get all documents from MongoDB"""
|
|
|
197 |
logger.error(f"Error in document upload: {str(e)}")
|
198 |
raise HTTPException(status_code=500, detail=str(e))
|
199 |
|
200 |
+
|
201 |
+
@app.get("/documentChunks/{document_id}")
|
202 |
async def get_document_chunks(document_id: str):
|
203 |
"""Get all chunks for a specific document"""
|
204 |
try:
|
|
|
258 |
logger.error(f"Error in delete_document endpoint: {str(e)}")
|
259 |
raise HTTPException(status_code=500, detail=str(e))
|
260 |
|
261 |
+
@app.post("/processDriveDocuments")
|
262 |
+
async def process_drive_documents():
|
263 |
+
try:
|
264 |
+
# Initialize vector store
|
265 |
+
vector_store, _ = await get_vector_store()
|
266 |
+
|
267 |
+
# Initialize Drive document processor
|
268 |
+
drive_processor = DriveDocumentProcessor(
|
269 |
+
google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
|
270 |
+
folder_id=settings.GOOGLE_DRIVE_FOLDER_ID,
|
271 |
+
temp_dir=settings.TEMP_DOWNLOAD_DIR,
|
272 |
+
doc_processor=doc_processor
|
273 |
+
)
|
274 |
+
|
275 |
+
# Process documents
|
276 |
+
result = await drive_processor.process_documents(vector_store)
|
277 |
+
return result
|
278 |
+
|
279 |
+
except Exception as e:
|
280 |
+
logger.error(f"Error in process_drive_documents: {str(e)}")
|
281 |
+
raise HTTPException(
|
282 |
+
status_code=500,
|
283 |
+
detail=str(e)
|
284 |
+
)
|
285 |
+
|
286 |
@app.post("/user/contact", response_model=ChatResponse)
|
287 |
async def create_user_contact(
|
288 |
request: UserContactRequest,
|
|
|
298 |
|
299 |
if existing_conversation_id:
|
300 |
chat_request = ChatRequest(
|
301 |
+
query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. This is Introduction Create a welcome back message for him and ask how i can help you today?',
|
302 |
llm_provider="openai",
|
303 |
max_context_docs=3,
|
304 |
temperature=1.0,
|
|
|
316 |
)
|
317 |
|
318 |
chat_request = ChatRequest(
|
319 |
+
query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. This is Introduction Create a welcome message for him and ask how i can help you today?',
|
320 |
llm_provider="openai",
|
321 |
max_context_docs=3,
|
322 |
temperature=1.0,
|
|
|
346 |
llm = get_llm_instance(request.llm_provider)
|
347 |
|
348 |
# Initialize RAG agent
|
349 |
+
# rag_agent = RAGAgent(
|
350 |
+
# llm=llm,
|
351 |
+
# embedding=embedding_model,
|
352 |
+
# vector_store=vector_store,
|
353 |
+
# mongodb=mongodb
|
354 |
+
# )
|
355 |
+
|
356 |
+
rag_agent = SystemInstructionsRAGAgent(
|
357 |
llm=llm,
|
358 |
embedding=embedding_model,
|
359 |
vector_store=vector_store,
|
360 |
mongodb=mongodb
|
361 |
)
|
362 |
|
363 |
+
# rag_agent.add_custom_role(
|
364 |
+
# "Knowledge based chatbot and introduction specialist",
|
365 |
+
# """You are a welcome agent with knowledge based specialist focusing on knowledge attached and create a beautiful welcome message.
|
366 |
+
# Your role is to:
|
367 |
+
# 1. Your response should be short and to the point.
|
368 |
+
# 2. Strictly follow this point for If it is an introduction. You strictly respond that "Welcome name of customer to our platform. How can I help you today?"
|
369 |
+
# """
|
370 |
+
# )
|
371 |
+
|
372 |
+
# rag_agent.add_custom_role(
|
373 |
+
# "Knowledge based chatbot",
|
374 |
+
# """You are a knowledge based specialist focusing on knowledge attached.
|
375 |
+
# Your role is to:
|
376 |
+
# 1. Your response should be short and to the point.
|
377 |
+
# 2. if it is not introduction then make sure to share the response from Vector store.
|
378 |
+
# 3. If you do not find relevant information. Just say I do not have this information but this do not apply to introduction message.
|
379 |
+
# 4. If there is an introduction, you should ignore above roles and connect with LLm to have a welcome message for the user.
|
380 |
+
# """
|
381 |
+
# )
|
382 |
+
|
383 |
# Use provided conversation ID or create new one
|
384 |
conversation_id = request.conversation_id or str(uuid.uuid4())
|
385 |
|
|
|
388 |
|
389 |
# Add specific instructions for certain types of queries
|
390 |
#if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
|
391 |
+
#query += ". The response should be short and to the point. Make sure to not add any irrelevant information. make sure to share the response from Vector store, if you do not find information in vector store. Just respond I do not have information. Keep the introduction concise and friendly."
|
392 |
|
393 |
# Generate response
|
394 |
logger.info(f"Generating response: {str(datetime.now())}")
|
src/models/__pycache__/UserContact.cpython-312.pyc
CHANGED
Binary files a/src/models/__pycache__/UserContact.cpython-312.pyc and b/src/models/__pycache__/UserContact.cpython-312.pyc differ
|
|
src/models/__pycache__/chat.cpython-312.pyc
CHANGED
Binary files a/src/models/__pycache__/chat.cpython-312.pyc and b/src/models/__pycache__/chat.cpython-312.pyc differ
|
|
src/models/__pycache__/rag.cpython-312.pyc
CHANGED
Binary files a/src/models/__pycache__/rag.cpython-312.pyc and b/src/models/__pycache__/rag.cpython-312.pyc differ
|
|
src/utils/__pycache__/conversation_manager.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/conversation_manager.cpython-312.pyc and b/src/utils/__pycache__/conversation_manager.cpython-312.pyc differ
|
|
src/utils/__pycache__/database_cleanup.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/database_cleanup.cpython-312.pyc and b/src/utils/__pycache__/database_cleanup.cpython-312.pyc differ
|
|
src/utils/__pycache__/document_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
|
|
src/utils/__pycache__/drive_document_processor.cpython-312.pyc
ADDED
Binary file (10.8 kB). View file
|
|
src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc and b/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc differ
|
|
src/utils/__pycache__/google_drive_service.cpython-312.pyc
ADDED
Binary file (3.94 kB). View file
|
|
src/utils/__pycache__/llm_utils.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/llm_utils.cpython-312.pyc and b/src/utils/__pycache__/llm_utils.cpython-312.pyc differ
|
|
src/utils/drive_document_processor.py
ADDED
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# src/utils/drive_document_processor.py
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import Dict, List, Any, Tuple
|
4 |
+
import logging
|
5 |
+
from fastapi import HTTPException
|
6 |
+
|
7 |
+
from src.utils.google_drive_service import GoogleDriveService
|
8 |
+
from src.utils.document_processor import DocumentProcessor
|
9 |
+
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
|
10 |
+
from src.utils.logger import logger
|
11 |
+
|
12 |
+
class DriveDocumentProcessor:
    """Sync documents from a Google Drive folder into a Chroma vector store.

    For every supported file in the configured folder the processor downloads
    the content to a temporary location, chunks it via ``DocumentProcessor``,
    and stores the chunks (with per-chunk metadata) in the vector store.
    A document is re-ingested only when its Drive ``modifiedTime`` differs
    from the value recorded in the stored chunk metadata.

    NOTE(review): documents *deleted* from Drive are not yet removed from the
    vector store — a reconciliation pass over stored document_ids is still
    needed (see commit message).
    """

    def __init__(
        self,
        google_service_account_path: str,
        folder_id: str,
        temp_dir: str,
        doc_processor: DocumentProcessor
    ):
        """
        Initialize Drive Document Processor

        Args:
            google_service_account_path (str): Path to Google service account credentials
            folder_id (str): Google Drive folder ID to process
            temp_dir (str): Directory for temporary files
            doc_processor (DocumentProcessor): Instance of DocumentProcessor
        """
        self.google_drive_service = GoogleDriveService(google_service_account_path)
        self.folder_id = folder_id
        self.temp_dir = Path(temp_dir)
        self.doc_processor = doc_processor

        # Create temp directory if it doesn't exist
        self.temp_dir.mkdir(exist_ok=True)

        # Map of ingestible MIME types -> local file extension used when saving.
        self.supported_mime_types = {
            # Google Docs
            'application/vnd.google-apps.document': '.docx',  # Export Google Docs as DOCX

            # Microsoft Word Documents
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',

            # Microsoft Excel Documents
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',

            # Text Documents
            'text/plain': '.txt',
            'text/csv': '.csv',
            'text/markdown': '.md',
            'text/html': '.html',
            'text/xml': '.xml',
            'application/json': '.json',
            'application/rtf': '.rtf',

            # PDF Documents
            'application/pdf': '.pdf'
        }

        # Google-native formats cannot be downloaded directly; they must be
        # exported to the Office MIME type given here.
        self.google_docs_export_types = {
            'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        }

    async def process_documents(
        self,
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """
        Process all documents in the specified Drive folder

        Args:
            vector_store (ChromaVectorStore): Vector store instance

        Returns:
            Dict[str, Any]: Summary with ``processed_files``, ``skipped_files``
            and ``errors`` sections, each holding a count and per-file details.

        Raises:
            HTTPException: 500 if listing or processing the folder fails
            as a whole (per-file failures are reported in ``errors`` instead).
        """
        try:
            # Get documents from folder
            files = self.google_drive_service.get_folder_contents(self.folder_id)

            processed_files = []
            skipped_files = []
            errors = []

            # Each file is handled independently so one bad document does not
            # abort the whole sync run.
            for file in files:
                result = await self._process_single_file(file, vector_store)

                if result['status'] == 'processed':
                    processed_files.append(result['data'])
                elif result['status'] == 'skipped':
                    skipped_files.append(result['data'])
                else:  # status == 'error'
                    errors.append(result['data'])

            # Clean up temporary directory if empty
            self._cleanup_temp_dir()

            return {
                "status": "completed",
                "processed_files": {
                    "count": len(processed_files),
                    "details": processed_files
                },
                "skipped_files": {
                    "count": len(skipped_files),
                    "details": skipped_files
                },
                "errors": {
                    "count": len(errors),
                    "details": errors
                }
            }

        except Exception as e:
            logger.error(f"Error processing Drive documents: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to process drive documents: {str(e)}"
            )

    async def _process_single_file(
        self,
        file: Dict[str, Any],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """Process a single Drive file.

        Returns a dict with ``status`` in {'processed', 'skipped', 'error'}
        and a ``data`` payload describing the outcome. Never raises; all
        failures are converted into an 'error' result.
        """
        mime_type = file.get('mimeType', '')

        # Skip if mime type not supported
        if mime_type not in self.supported_mime_types:
            return {
                'status': 'skipped',
                'data': {
                    'name': file['name'],
                    'reason': f'Unsupported mime type: {mime_type}'
                }
            }

        try:
            document_id = file['id']
            modified_time = file.get('modifiedTime', 'N/A')  # Get last modified time

            # save_document() deletes stale chunks and tells us whether the
            # document needs (re-)ingestion.
            if not self.save_document(document_id, vector_store, modified_time):
                # Document already present with an unchanged modified_time.
                return {
                    'status': 'skipped',
                    'data': {
                        'name': file['name'],
                        'reason': 'Document already exists in the memory.'
                    }
                }

            # Download and process file
            temp_file_path = await self._download_and_save_file(
                file['id'],
                mime_type
            )

            try:
                # Process document
                processed_doc = await self.doc_processor.process_document(
                    str(temp_file_path)
                )

                # Add to vector store
                self._add_to_vector_store(
                    processed_doc['chunks'],
                    file,
                    mime_type,
                    vector_store
                )

                return {
                    'status': 'processed',
                    'data': {
                        'name': file['name'],
                        'id': file['id'],
                        'chunks_processed': len(processed_doc['chunks'])
                    }
                }

            finally:
                # Clean up temporary file even when processing fails.
                if temp_file_path.exists():
                    temp_file_path.unlink()

        # Single handler: the original had two identical `except Exception`
        # clauses on this try, the second of which was unreachable.
        except Exception as e:
            logger.error(f"Error processing file {file['name']}: {str(e)}")
            return {
                'status': 'error',
                'data': {
                    'file_name': file['name'],
                    'error': str(e)
                }
            }

    async def _download_and_save_file(
        self,
        file_id: str,
        mime_type: str
    ) -> Path:
        """Download a Drive file (exporting Google-native docs) to a temp path.

        Returns:
            Path: Location of the saved temporary file, named ``<file_id><ext>``.
        """
        extension = self.supported_mime_types[mime_type]
        temp_file_path = self.temp_dir / f"{file_id}{extension}"

        if mime_type in self.google_docs_export_types:
            # Download Google Doc in the specified export format
            content = self.google_drive_service.export_file(
                file_id,
                self.google_docs_export_types[mime_type]
            )
        else:
            # Download regular file
            content = self.google_drive_service.download_file(file_id)

        with open(temp_file_path, 'wb') as f:
            # Some handlers may return text; normalize to bytes.
            if isinstance(content, str):
                f.write(content.encode('utf-8'))
            else:
                f.write(content)

        return temp_file_path

    def _add_to_vector_store(
        self,
        chunks: List[str],
        file: Dict[str, Any],
        mime_type: str,
        vector_store: ChromaVectorStore
    ) -> None:
        """Add processed chunks to the vector store.

        Chunk ids follow the pattern ``<drive-file-id>-chunk-<i>`` so that all
        chunks of a document can later be located/deleted by document id.
        """
        chunk_metadatas = []
        chunk_ids = []

        modified_time = file.get('modifiedTime', 'N/A')  # Get last modified time

        for i, chunk in enumerate(chunks):
            chunk_id = f"{file['id']}-chunk-{i}"
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                "source": file['name'],
                "document_id": file['id'],
                "chunk_index": i,
                "mime_type": mime_type,
                "modified_time": modified_time,
                "total_chunks": len(chunks),
                "file_type": self.supported_mime_types[mime_type],
                "is_google_doc": mime_type.startswith('application/vnd.google-apps')
            })

        vector_store.add_documents(
            documents=chunks,
            metadatas=chunk_metadatas,
            ids=chunk_ids
        )

    def save_document(self, document_id: str, vector_store: ChromaVectorStore, modified_date: str) -> bool:
        """
        Decide whether a document must be (re-)ingested, deleting stale chunks.

        If stored chunks exist but their ``modified_time`` differs from
        ``modified_date``, all chunks for the document are deleted so the
        caller can re-ingest the fresh copy.

        Args:
            document_id (str): The ID of the document.
            vector_store (ChromaVectorStore): The Chroma vector store instance.
            modified_date (str): The document's current Drive modifiedTime.

        Returns:
            bool: True if the caller should (re-)process the document,
            False if stored chunks are already up to date.
        """
        try:
            # Retrieve all chunks for the given document_id
            chunks = vector_store.get_document_chunks(document_id)

            if not chunks:
                logger.warning(f"No chunks found for document_id: {document_id}. Nothing to delete.")
                return True

            # Check the modified_time of the first chunk; all chunks of a
            # document are written with the same modified_time.
            first_chunk_metadata = chunks[0].get("metadata", {})

            if first_chunk_metadata.get("modified_time") != modified_date:
                # If modified_time doesn't match, delete all chunks
                vector_store.delete_document(document_id)
                logger.info(f"Deleted all chunks for document_id: {document_id} due to modified_time mismatch.")
                return True
            else:
                logger.info(f"No deletion needed for document_id: {document_id}, modified_time is unchanged.")
                return False

        except Exception as e:
            logger.error(f"Error while deleting chunks for document_id {document_id}: {str(e)}")
            # Fail open: on lookup errors we re-process rather than risk
            # serving stale content.
            return True

    def _cleanup_temp_dir(self) -> None:
        """Clean up temporary directory if empty"""
        if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
            self.temp_dir.rmdir()
|
src/utils/google_drive_service.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# src/utils/google_drive_service.py
|
2 |
+
from google.oauth2 import service_account
|
3 |
+
from googleapiclient.discovery import build
|
4 |
+
from googleapiclient.http import MediaIoBaseDownload
|
5 |
+
import io
|
6 |
+
import os
|
7 |
+
|
8 |
+
class GoogleDriveService:
    """Thin read-only wrapper around the Google Drive v3 API."""

    def __init__(self, credentials_path: str):
        """
        Initialize Google Drive service

        Args:
            credentials_path (str): Path to service account credentials file
        """
        # Read-only scope: this service never mutates Drive content.
        self.credentials = service_account.Credentials.from_service_account_file(
            credentials_path,
            scopes=['https://www.googleapis.com/auth/drive.readonly']
        )
        self.service = build('drive', 'v3', credentials=self.credentials)

    def get_folder_contents(self, folder_id: str):
        """
        Get contents of a Drive folder

        Args:
            folder_id (str): ID of the folder to process

        Returns:
            List[Dict]: File metadata (id, name, mimeType, modifiedTime),
            excluding trashed items. Shared-drive items are included.
        """
        query = f"'{folder_id}' in parents and trashed=false"
        results = self.service.files().list(
            q=query,
            fields="files(id, name, mimeType,modifiedTime)",
            supportsAllDrives=True,
            includeItemsFromAllDrives=True
        ).execute()
        return results.get('files', [])

    def _download(self, request) -> bytes:
        """Drain a Drive media request into memory and return the raw bytes.

        Shared by download_file() and export_file(), which previously
        duplicated this chunked-download loop.
        """
        content = io.BytesIO()
        downloader = MediaIoBaseDownload(content, request)

        done = False
        while not done:
            _, done = downloader.next_chunk()

        content.seek(0)
        return content.read()

    def download_file(self, file_id: str) -> bytes:
        """
        Download a file from Drive

        Args:
            file_id (str): ID of the file to download

        Returns:
            bytes: File content
        """
        return self._download(self.service.files().get_media(fileId=file_id))

    def export_file(self, file_id: str, mime_type: str) -> bytes:
        """
        Export a Google Workspace file to a different format

        Args:
            file_id (str): ID of the file to export
            mime_type (str): MIME type to export to

        Returns:
            bytes: Exported file content
        """
        request = self.service.files().export_media(
            fileId=file_id,
            mimeType=mime_type
        )
        return self._download(request)
|
src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc
CHANGED
Binary files a/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc differ
|
|
testfile.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
testing123
|
|
|
|