TalatMasood commited on
Commit
aee2bfd
·
1 Parent(s): b953016

Implementation for Google Drive is done. It is now working correctly, except that we still need a function that deletes the stored chunks when a document is removed from Google Drive.

Browse files
client_secret_1048685176388-r728705k0ef26t09rblffue3cqeaaco1.apps.googleusercontent.com.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"web":{"client_id":"1048685176388-r728705k0ef26t09rblffue3cqeaaco1.apps.googleusercontent.com","project_id":"demochatbotpropx","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"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","redirect_uris":["http://127.0.0.1:8000/google/oauth2callback"],"javascript_origins":["http://localhost:8000"]}}
config/__pycache__/config.cpython-312.pyc CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
 
config/config.py CHANGED
@@ -1,6 +1,8 @@
1
  # config/config.py
2
  import os
3
  from dotenv import load_dotenv
 
 
4
 
5
  # Load environment variables
6
  load_dotenv()
@@ -31,7 +33,34 @@ class Settings:
31
  # Feedback Configuration
32
  MAX_RATING = int(os.getenv('MAX_RATING', '5'))
33
 
 
 
 
34
  # Application Configuration
35
  DEBUG = os.getenv('DEBUG', 'False') == 'True'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  settings = Settings()
 
1
  # config/config.py
2
  import os
3
  from dotenv import load_dotenv
4
+ from google.oauth2.credentials import Credentials
5
+ from google_auth_oauthlib.flow import Flow
6
 
7
  # Load environment variables
8
  load_dotenv()
 
33
  # Feedback Configuration
34
  MAX_RATING = int(os.getenv('MAX_RATING', '5'))
35
 
36
+ # Temporary directory for downloaded files
37
+ TEMP_DOWNLOAD_DIR = os.getenv('TEMP_DOWNLOAD_DIR', './temp_downloads')
38
+
39
  # Application Configuration
40
  DEBUG = os.getenv('DEBUG', 'False') == 'True'
41
+
42
+ # Google Drive Configuration
43
+ GOOGLE_DRIVE_FOLDER_ID=os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
44
+ GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv('GOOGLE_SERVICE_ACCOUNT_PATH', 'service_account.json')
45
+
46
+ # GOOGLE_DRIVE_FOLDER_ID = os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
47
+ # GOOGLE_OAUTH_CLIENT_ID = os.getenv('GOOGLE_OAUTH_CLIENT_ID', '')
48
+ # GOOGLE_OAUTH_CLIENT_SECRET = os.getenv('GOOGLE_OAUTH_CLIENT_SECRET', '')
49
+ # GOOGLE_OAUTH_REDIRECT_URI = os.getenv('GOOGLE_OAUTH_REDIRECT_URI', 'http://127.0.0.1:8000/google/oauth2callback')
50
+
51
+ # @property
52
+ # def google_oauth_flow(self):
53
+ # flow = Flow.from_client_config({
54
+ # "web": {
55
+ # "client_id": self.GOOGLE_OAUTH_CLIENT_ID,
56
+ # "client_secret": self.GOOGLE_OAUTH_CLIENT_SECRET,
57
+ # "auth_uri": "https://accounts.google.com/o/oauth2/auth",
58
+ # "token_uri": "https://oauth2.googleapis.com/token",
59
+ # "redirect_uris": [self.GOOGLE_OAUTH_REDIRECT_URI],
60
+ # "javascript_origins": ["http://localhost:8000", "http://127.0.0.1:8000"]
61
+ # }
62
+ # }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
63
+ # flow.redirect_uri = self.GOOGLE_OAUTH_REDIRECT_URI
64
+ # return flow
65
 
66
  settings = Settings()
service_account.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "service_account",
3
+ "project_id": "demochatbotpropx",
4
+ "private_key_id": "7afaa51c4fc75d0d25668e84032d12622408356e",
5
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQC9Or6H3z2BSL24\nfc41WTxUmmhzXUggT8cM0VB/5PTMat8j/vmVhia7mwpMnNEU5f4xcwuF0Lkhjsj2\nCryliph3dPxPUaVCy7aFBatl6kjvf9lBh3bTtIXtONjBN9w2UEqOc85zTpBJxqrT\nNb50VUnZhEDeWAcvB5L3l5+EZxlKUN6HDCdadhLWK4MMuk9XCwQvfqlWR/kT8TJd\nVa408Q4sEoE5F0w5g2epPGBCbCN/FonFq1Mp1Q+kUDmJvdVma4JVBVHSDrqszKum\n/9UPREIEIWar5FPkIVFDwhpKTIWKuwwfdSfbuchVihv1POYksx8NebzxJigP4TWG\nE+QuPpodAgMBAAECggEAC3DMfMkakVrCA0nL5Veehg9XbR4nLBjwRzScG6Q0+4Tv\nz8Y5j18IfKnStUlH4aEUI/Sfx9JHClUOwZCdnjT81qZ8HxmOSc8PAaGs0eEYnsfj\nU6RqiUGUiIA6DqVSpMM1XSgTdwI5em4B/WB2KHEEB+ju+RWbMMgFWTKefAu1sKBY\n1GvNw0QhiizqQN0q2CNSSPqDkPkb24YNTuX0i3XAt3lel3JFbv/SsRgX8NHbO0aU\nPlDleqQF6wWVu5wVmwbw8digXDcsPw/8gb/EqEfgaKLz0/bKd74nyKA/pGZnD+KN\n1pusV+iYymfaejFmSdRPMVq0NfWqRLRw8i7T1xnYmwKBgQD4+cPaeVt1kCqXtCEm\nhRofAa9rPnhkHb44pUffalYgdLi9vT31H1bHh9D8leN4Df1sLQrTttxa7QaVIPfX\nE64yAShyu603nmZ/GUAvtUy54lB76SOjHMWNhbCyEzDcOYSkjH3SedlSYX1xqYNl\nN0GUFeaBDS1PUyyYTSN2rZRYpwKBgQDCkXXgYirMaw2jksij2Ru8ZdT/Dg54YhHQ\n6+/xV+K+MCnt6vw3qloatJzOdXEnulndjl71d8WVYlpBkHzHEaLy3XgA6FGtwq6J\nvKS1w/FZcDi2ra0RFtAi844/HUftbB1ZBSe93nxHNw10XRzFHtFc6OPFcFfOmhgJ\nnYLbH2xLmwKBgBtD9u/RBHQOcqukXVEDmIW2wIglEjgcjb1UVFeiJIZvYd/dfpB+\nexlkxT00CPIXzh3vnNTsnJsUg/kG4D1ceWIegFh4NxL1NNJMaJwQ5bMhlqDLOkzd\nlMDX2C7YLSyg2+bNP+Yx09vSs1MkNjB6aaMW9uRBFiouuJ6BLBYOEkXXAoGBAJp3\n3Tuc9BmCTDu2xu+959U0i1tKj5ZnVXmmNsJGYc9YcZFfY4nWBt740RzgBEvkGIBb\nDWyYABdPFBTFXyq0B8gEp8cgqefnjaXwTFu6ChxVidEOJT5R/EAjWKUm2/nUQaBx\nBVIqFkR7ooTlf3fHtbOreVlAjZWKpNbNZBwO4G1NAoGAU3HX5fyC+0OWMi22cIha\nyUivxH8ti6JmfNJllRr5V3bHHqKrLCPC8tgAZouvutendjm3beNhTeQGoY2QsNhI\nF5NC8euwpYMLhhwVvTB0G6sBxplZZ3FUNMUtpFQ0qvb6VYNQOa2+qBagemhEPhxs\nBfQH0FnqRjBjGoaX7nw6FnU=\n-----END PRIVATE KEY-----\n",
6
+ "client_email": "[email protected]",
7
+ "client_id": "108855684700262853537",
8
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
+ "token_uri": "https://oauth2.googleapis.com/token",
10
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/demochatbotpropx%40demochatbotpropx.iam.gserviceaccount.com",
12
+ "universe_domain": "googleapis.com"
13
+ }
src/__pycache__/main.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
 
src/agents/__pycache__/excel_aware_rag.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/excel_aware_rag.cpython-312.pyc and b/src/agents/__pycache__/excel_aware_rag.cpython-312.pyc differ
 
src/agents/__pycache__/rag_agent.cpython-312.pyc CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
 
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc ADDED
Binary file (8.58 kB). View file
 
src/agents/system_instructions_rag.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/agents/system_instructions_rag.py
2
+ from typing import List, Dict, Optional
3
+ from src.agents.rag_agent import RAGResponse
4
+ from src.utils.logger import logger
5
+ from src.agents.rag_agent import RAGAgent
6
+
7
class SystemInstructionsRAGAgent(RAGAgent):
    """RAG Agent with enhanced system instructions for specific use cases.

    Specializes RAGAgent in three ways:
      * introduction/welcome queries (produced by the /user/contact endpoint)
        are answered from a template without consulting the vector store;
      * queries with no relevant context return a fixed "not available" reply;
      * generated answers are stripped of boilerplate lead-in phrases.
    """

    async def generate_response(
        self,
        query: str,
        conversation_id: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: Optional[int] = None,
        context_docs: Optional[List[str]] = None
    ) -> RAGResponse:
        """
        Generate response with specific handling for introduction and no-context cases.

        Args:
            query: User query (may be a synthetic introduction prompt).
            conversation_id: Optional conversation identifier (unused here,
                kept for interface compatibility with RAGAgent).
            temperature: LLM sampling temperature.
            max_tokens: Optional generation cap forwarded to the LLM.
            context_docs: Pre-retrieved context documents; when omitted,
                context is retrieved from the vector store.

        Returns:
            RAGResponse with the answer text, the context used, sources and scores.

        Raises:
            Exception: re-raises any failure after logging it.
        """
        try:
            # An "introduction" query is the synthetic prompt built by the
            # /user/contact endpoint; it must be answered from a template.
            is_introduction = (
                "wants support" in query and
                "This is Introduction" in query and
                ("A new user with name:" in query or "An old user with name:" in query)
            )

            if is_introduction:
                # Handle introduction message - no context needed
                welcome_message = self._handle_contact_query(query)
                return RAGResponse(
                    response=welcome_message,
                    context_docs=[],
                    sources=[],
                    scores=None
                )

            # BUG FIX: sources/scores were previously assigned only inside the
            # retrieval branch below; when the caller supplied context_docs
            # they were unbound and the final RAGResponse(...) raised
            # UnboundLocalError. Initialize them before branching.
            sources: List = []
            scores = None

            # For all other queries, proceed with context-based response
            if not context_docs:
                context_docs, sources, scores = await self.retrieve_context(
                    query,
                    conversation_history=[]
                )

            # Check if we have relevant context
            has_relevant_context = self._check_context_relevance(query, context_docs or [])

            # If no relevant context found, return the standard message
            if not has_relevant_context:
                return RAGResponse(
                    response="Information about this is not available, do you want to inquire about something else?",
                    context_docs=[],
                    sources=[],
                    scores=None
                )

            # Generate response using context
            prompt = self._create_response_prompt(query, context_docs)
            response_text = self.llm.generate(
                prompt,
                temperature=temperature,
                max_tokens=max_tokens
            )

            # The LLM may still answer "I don't have that" despite having
            # context; normalize that case to the standard message too.
            cleaned_response = self._clean_response(response_text)
            if self._is_no_info_response(cleaned_response):
                return RAGResponse(
                    response="Information about this is not available, do you want to inquire about something else?",
                    context_docs=[],
                    sources=[],
                    scores=None
                )

            return RAGResponse(
                response=cleaned_response,
                context_docs=context_docs,
                sources=sources,
                scores=scores
            )

        except Exception as e:
            logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
            raise

    def _is_no_info_response(self, response: str) -> bool:
        """Check if the response indicates no information available."""
        no_info_indicators = [
            "i do not have",
            "i don't have",
            "no information",
            "not available",
            "could not find",
            "couldn't find",
            "cannot find"
        ]
        response_lower = response.lower()
        return any(indicator in response_lower for indicator in no_info_indicators)

    def _check_context_relevance(self, query: str, context_docs: List[str]) -> bool:
        """Check if context contains information relevant to the query.

        Relevance is a simple keyword test: any non-stop-word query term
        appearing in any document counts as relevant.
        """
        if not context_docs:
            return False

        # Extract key terms from query
        query_words = query.lower().split()
        stop_words = {'share', 'me', 'a', 'about', 'information', 'what', 'is', 'are', 'the', 'in', 'how', 'why', 'when', 'where'}
        query_terms = {word for word in query_words if word not in stop_words}

        # Check each context document for relevance
        for doc in context_docs:
            if not doc:
                continue
            doc_lower = doc.lower()
            if any(term in doc_lower for term in query_terms):
                # Found relevant content
                return True
        return False

    def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
        """Create prompt for generating response from context."""
        formatted_context = '\n\n'.join(
            f"Context {i+1}:\n{doc.strip()}"
            for i, doc in enumerate(context_docs)
            if doc and doc.strip()
        )

        return f"""
Use ONLY the following context to provide information about: {query}

{formatted_context}

Instructions:
1. Use ONLY information present in the context above
2. If the information is found in the context, provide a direct and concise response
3. Do not make assumptions or add information not present in the context
4. Ensure the response is clear and complete based on available information
5. If you cannot find relevant information about the specific query in the context,
respond exactly with: "Information about this is not available, do you want to inquire about something else?"

Query: {query}
Response:"""

    def _handle_contact_query(self, query: str) -> str:
        """Handle queries from /user/contact endpoint.

        Extracts the user's name from the synthetic prompt and returns a
        templated welcome (or welcome-back) message; falls back to a generic
        greeting if parsing fails.
        """
        try:
            # str.find returns -1 when absent; -1 + 7 == 6, so the
            # name_start > 6 check below also covers the not-found case.
            name_start = query.find('name: "') + 7
            name_end = query.find('"', name_start)
            name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"

            is_returning = (
                "An old user with name:" in query and
                "wants support again" in query
            )

            if is_returning:
                return f"Welcome back {name}, How can I help you?"
            return f"Welcome {name}, How can I help you?"

        except Exception as e:
            logger.error(f"Error handling contact query: {str(e)}")
            return "Welcome, How can I help you?"

    def _clean_response(self, response: str) -> str:
        """Clean response by removing unwanted boilerplate lead-in phrases."""
        if not response:
            return response

        phrases_to_remove = [
            "Based on the context provided,",
            "According to the documents,",
            "From the information available,",
            "I can tell you that",
            "Let me help you with that",
            "I understand you're asking about",
            "To answer your question,",
            "The documents indicate that",
            "Based on the available information,",
            "As per the provided context,",
            "I would be happy to help you with that",
            "Let me provide you with information about",
            "Here's what I found:",
            "Here's the information you requested:",
            "According to the provided information,",
            "Based on the documents,",
            "The information suggests that",
            "From what I can see,",
            "Let me explain",
            "To clarify,",
            "It appears that",
            "I can see that",
            "Sure,",
            "Well,",
            "Based on the given context,",
            "The available information shows that",
            "From the context provided,",
            "The documentation mentions that",
            "According to the context,",
            "As shown in the context,",
            "I apologize,"
        ]

        cleaned_response = response
        for phrase in phrases_to_remove:
            cleaned_response = cleaned_response.replace(phrase, "").strip()

        # Collapse any doubled whitespace left behind by the removals.
        cleaned_response = " ".join(cleaned_response.split())

        # If stripping removed everything, fall back to the raw response.
        if not cleaned_response:
            return response

        if cleaned_response[0].islower():
            cleaned_response = cleaned_response[0].upper() + cleaned_response[1:]

        return cleaned_response
src/db/__pycache__/mongodb_store.cpython-312.pyc CHANGED
Binary files a/src/db/__pycache__/mongodb_store.cpython-312.pyc and b/src/db/__pycache__/mongodb_store.cpython-312.pyc differ
 
src/main.py CHANGED
@@ -8,12 +8,23 @@ import uuid
8
  from datetime import datetime
9
  from pathlib import Path
10
  import os
 
 
 
 
 
 
 
 
 
11
 
12
  # Import custom modules1
13
- from src.agents.rag_agent import RAGAgent
 
14
  from src.models.document import AllDocumentsResponse, StoredDocument
15
  from src.models.UserContact import UserContactRequest
16
  from src.utils.document_processor import DocumentProcessor
 
17
  from src.utils.conversation_summarizer import ConversationSummarizer
18
  from src.utils.logger import logger
19
  from src.utils.llm_utils import get_llm_instance, get_vector_store
@@ -43,6 +54,8 @@ app.add_middleware(
43
  allow_headers=["*"], # Allows all headers
44
  )
45
 
 
 
46
  # Initialize MongoDB
47
  mongodb = MongoDBStore(settings.MONGODB_URI)
48
 
@@ -70,6 +83,43 @@ async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
70
  )
71
  return api_key
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  @app.get("/documents")
74
  async def get_all_documents():
75
  """Get all documents from MongoDB"""
@@ -147,7 +197,8 @@ async def upload_documents(
147
  logger.error(f"Error in document upload: {str(e)}")
148
  raise HTTPException(status_code=500, detail=str(e))
149
 
150
- @app.get("/documentchunks/{document_id}")
 
151
  async def get_document_chunks(document_id: str):
152
  """Get all chunks for a specific document"""
153
  try:
@@ -207,8 +258,31 @@ async def delete_document(document_id: str):
207
  logger.error(f"Error in delete_document endpoint: {str(e)}")
208
  raise HTTPException(status_code=500, detail=str(e))
209
 
210
- # src/main.py
211
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  @app.post("/user/contact", response_model=ChatResponse)
213
  async def create_user_contact(
214
  request: UserContactRequest,
@@ -224,7 +298,7 @@ async def create_user_contact(
224
 
225
  if existing_conversation_id:
226
  chat_request = ChatRequest(
227
- query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. Create a welcome back message for him and ask how i can help you today?',
228
  llm_provider="openai",
229
  max_context_docs=3,
230
  temperature=1.0,
@@ -242,7 +316,7 @@ async def create_user_contact(
242
  )
243
 
244
  chat_request = ChatRequest(
245
- query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. Create a welcome message for him and ask how i can help you today?',
246
  llm_provider="openai",
247
  max_context_docs=3,
248
  temperature=1.0,
@@ -272,13 +346,40 @@ async def chat_endpoint(
272
  llm = get_llm_instance(request.llm_provider)
273
 
274
  # Initialize RAG agent
275
- rag_agent = RAGAgent(
 
 
 
 
 
 
 
276
  llm=llm,
277
  embedding=embedding_model,
278
  vector_store=vector_store,
279
  mongodb=mongodb
280
  )
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  # Use provided conversation ID or create new one
283
  conversation_id = request.conversation_id or str(uuid.uuid4())
284
 
@@ -287,7 +388,7 @@ async def chat_endpoint(
287
 
288
  # Add specific instructions for certain types of queries
289
  #if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
290
- query += ". The response should be short and to the point. Make sure to not add any irrelevant information. Keep the introduction concise and friendly."
291
 
292
  # Generate response
293
  logger.info(f"Generating response: {str(datetime.now())}")
 
8
  from datetime import datetime
9
  from pathlib import Path
10
  import os
11
+ import asyncio
12
+ os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
13
+ #os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
14
+
15
+
16
+ from fastapi.responses import RedirectResponse
17
+ from google.oauth2.credentials import Credentials
18
+ from google_auth_oauthlib.flow import Flow
19
+ from src.utils.google_drive_service import GoogleDriveService
20
 
21
  # Import custom modules1
22
+ #from src.agents.rag_agent import RAGAgent
23
+ from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
24
  from src.models.document import AllDocumentsResponse, StoredDocument
25
  from src.models.UserContact import UserContactRequest
26
  from src.utils.document_processor import DocumentProcessor
27
+ from src.utils.drive_document_processor import DriveDocumentProcessor
28
  from src.utils.conversation_summarizer import ConversationSummarizer
29
  from src.utils.logger import logger
30
  from src.utils.llm_utils import get_llm_instance, get_vector_store
 
54
  allow_headers=["*"], # Allows all headers
55
  )
56
 
57
+ #google_drive_service = GoogleDriveService()
58
+
59
  # Initialize MongoDB
60
  mongodb = MongoDBStore(settings.MONGODB_URI)
61
 
 
83
  )
84
  return api_key
85
 
86
+ # @app.get("/google/auth")
87
+ # async def google_auth():
88
+ # authorization_url, _ = settings.google_oauth_flow.authorization_url(
89
+ # access_type='offline',
90
+ # prompt='consent',
91
+ # include_granted_scopes='true'
92
+ # )
93
+ # return RedirectResponse(authorization_url)
94
+
95
+ # @app.get("/google/oauth2callback")
96
+ # async def google_auth_callback(code: str):
97
+ # flow = Flow.from_client_config({
98
+ # "web": {
99
+ # "client_id": settings.GOOGLE_OAUTH_CLIENT_ID,
100
+ # "client_secret": settings.GOOGLE_OAUTH_CLIENT_SECRET,
101
+ # "auth_uri": "https://accounts.google.com/o/oauth2/auth",
102
+ # "token_uri": "https://oauth2.googleapis.com/token",
103
+ # "redirect_uris": [settings.GOOGLE_OAUTH_REDIRECT_URI]
104
+ # }
105
+ # }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
106
+
107
+ # flow.redirect_uri = settings.GOOGLE_OAUTH_REDIRECT_URI
108
+
109
+ # # Add access type and prompt parameters for refresh token
110
+ # flow.fetch_token(
111
+ # code=code,
112
+ # access_type='offline',
113
+ # prompt='consent'
114
+ # )
115
+ # credentials = flow.credentials
116
+
117
+ # return {
118
+ # "message": "Authentication successful",
119
+ # "credentials": credentials.to_json()
120
+ # }
121
+
122
+
123
  @app.get("/documents")
124
  async def get_all_documents():
125
  """Get all documents from MongoDB"""
 
197
  logger.error(f"Error in document upload: {str(e)}")
198
  raise HTTPException(status_code=500, detail=str(e))
199
 
200
+
201
+ @app.get("/documentChunks/{document_id}")
202
  async def get_document_chunks(document_id: str):
203
  """Get all chunks for a specific document"""
204
  try:
 
258
  logger.error(f"Error in delete_document endpoint: {str(e)}")
259
  raise HTTPException(status_code=500, detail=str(e))
260
 
261
@app.post("/processDriveDocuments")
async def process_drive_documents():
    """Ingest all supported files from the configured Google Drive folder.

    Builds a DriveDocumentProcessor from application settings, runs it against
    the shared vector store, and returns its processed/skipped/error summary.
    Any failure is surfaced as an HTTP 500.
    """
    try:
        vector_store, _ = await get_vector_store()

        processor = DriveDocumentProcessor(
            google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
            folder_id=settings.GOOGLE_DRIVE_FOLDER_ID,
            temp_dir=settings.TEMP_DOWNLOAD_DIR,
            doc_processor=doc_processor,
        )

        return await processor.process_documents(vector_store)

    except Exception as e:
        logger.error(f"Error in process_drive_documents: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
285
+
286
  @app.post("/user/contact", response_model=ChatResponse)
287
  async def create_user_contact(
288
  request: UserContactRequest,
 
298
 
299
  if existing_conversation_id:
300
  chat_request = ChatRequest(
301
+ query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. This is Introduction Create a welcome back message for him and ask how i can help you today?',
302
  llm_provider="openai",
303
  max_context_docs=3,
304
  temperature=1.0,
 
316
  )
317
 
318
  chat_request = ChatRequest(
319
+ query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. This is Introduction Create a welcome message for him and ask how i can help you today?',
320
  llm_provider="openai",
321
  max_context_docs=3,
322
  temperature=1.0,
 
346
  llm = get_llm_instance(request.llm_provider)
347
 
348
  # Initialize RAG agent
349
+ # rag_agent = RAGAgent(
350
+ # llm=llm,
351
+ # embedding=embedding_model,
352
+ # vector_store=vector_store,
353
+ # mongodb=mongodb
354
+ # )
355
+
356
+ rag_agent = SystemInstructionsRAGAgent(
357
  llm=llm,
358
  embedding=embedding_model,
359
  vector_store=vector_store,
360
  mongodb=mongodb
361
  )
362
 
363
+ # rag_agent.add_custom_role(
364
+ # "Knowledge based chatbot and introduction specialist",
365
+ # """You are a welcome agent with knowledge based specialist focusing on knowledge attached and create a beautiful welcome message.
366
+ # Your role is to:
367
+ # 1. Your response should be short and to the point.
368
+ # 2. Strictly follow this point for If it is an introduction. You strictly respond that "Welcome name of customer to our platform. How can I help you today?"
369
+ # """
370
+ # )
371
+
372
+ # rag_agent.add_custom_role(
373
+ # "Knowledge based chatbot",
374
+ # """You are a knowledge based specialist focusing on knowledge attached.
375
+ # Your role is to:
376
+ # 1. Your response should be short and to the point.
377
+ # 2. if it is not introduction then make sure to share the response from Vector store.
378
+ # 3. If you do not find relevant information. Just say I do not have this information but this do not apply to introduction message.
379
+ # 4. If there is an introduction, you should ignore above roles and connect with LLm to have a welcome message for the user.
380
+ # """
381
+ # )
382
+
383
  # Use provided conversation ID or create new one
384
  conversation_id = request.conversation_id or str(uuid.uuid4())
385
 
 
388
 
389
  # Add specific instructions for certain types of queries
390
  #if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
391
+ #query += ". The response should be short and to the point. Make sure to not add any irrelevant information. make sure to share the response from Vector store, if you do not find information in vector store. Just respond I do not have information. Keep the introduction concise and friendly."
392
 
393
  # Generate response
394
  logger.info(f"Generating response: {str(datetime.now())}")
src/models/__pycache__/UserContact.cpython-312.pyc CHANGED
Binary files a/src/models/__pycache__/UserContact.cpython-312.pyc and b/src/models/__pycache__/UserContact.cpython-312.pyc differ
 
src/models/__pycache__/chat.cpython-312.pyc CHANGED
Binary files a/src/models/__pycache__/chat.cpython-312.pyc and b/src/models/__pycache__/chat.cpython-312.pyc differ
 
src/models/__pycache__/rag.cpython-312.pyc CHANGED
Binary files a/src/models/__pycache__/rag.cpython-312.pyc and b/src/models/__pycache__/rag.cpython-312.pyc differ
 
src/utils/__pycache__/conversation_manager.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/conversation_manager.cpython-312.pyc and b/src/utils/__pycache__/conversation_manager.cpython-312.pyc differ
 
src/utils/__pycache__/database_cleanup.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/database_cleanup.cpython-312.pyc and b/src/utils/__pycache__/database_cleanup.cpython-312.pyc differ
 
src/utils/__pycache__/document_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
 
src/utils/__pycache__/drive_document_processor.cpython-312.pyc ADDED
Binary file (10.8 kB). View file
 
src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc and b/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc differ
 
src/utils/__pycache__/google_drive_service.cpython-312.pyc ADDED
Binary file (3.94 kB). View file
 
src/utils/__pycache__/llm_utils.cpython-312.pyc CHANGED
Binary files a/src/utils/__pycache__/llm_utils.cpython-312.pyc and b/src/utils/__pycache__/llm_utils.cpython-312.pyc differ
 
src/utils/drive_document_processor.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/utils/drive_document_processor.py
2
+ from pathlib import Path
3
+ from typing import Dict, List, Any, Tuple
4
+ import logging
5
+ from fastapi import HTTPException
6
+
7
+ from src.utils.google_drive_service import GoogleDriveService
8
+ from src.utils.document_processor import DocumentProcessor
9
+ from src.vectorstores.chroma_vectorstore import ChromaVectorStore
10
+ from src.utils.logger import logger
11
+
12
+ class DriveDocumentProcessor:
13
+ def __init__(
14
+ self,
15
+ google_service_account_path: str,
16
+ folder_id: str,
17
+ temp_dir: str,
18
+ doc_processor: DocumentProcessor
19
+ ):
20
+ """
21
+ Initialize Drive Document Processor
22
+
23
+ Args:
24
+ google_service_account_path (str): Path to Google service account credentials
25
+ folder_id (str): Google Drive folder ID to process
26
+ temp_dir (str): Directory for temporary files
27
+ doc_processor (DocumentProcessor): Instance of DocumentProcessor
28
+ """
29
+ self.google_drive_service = GoogleDriveService(google_service_account_path)
30
+ self.folder_id = folder_id
31
+ self.temp_dir = Path(temp_dir)
32
+ self.doc_processor = doc_processor
33
+
34
+ # Create temp directory if it doesn't exist
35
+ self.temp_dir.mkdir(exist_ok=True)
36
+
37
+ # Define supported MIME types
38
+ self.supported_mime_types = {
39
+ # Google Docs
40
+ 'application/vnd.google-apps.document': '.docx', # Export Google Docs as DOCX
41
+
42
+ # Microsoft Word Documents
43
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
44
+ 'application/msword': '.doc',
45
+
46
+ # Microsoft Excel Documents
47
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
48
+ 'application/vnd.ms-excel': '.xls',
49
+
50
+ # Text Documents
51
+ 'text/plain': '.txt',
52
+ 'text/csv': '.csv',
53
+ 'text/markdown': '.md',
54
+ 'text/html': '.html',
55
+ 'text/xml': '.xml',
56
+ 'application/json': '.json',
57
+ 'application/rtf': '.rtf',
58
+
59
+ # PDF Documents
60
+ 'application/pdf': '.pdf'
61
+ }
62
+
63
+ # Define export MIME types for Google Docs formats
64
+ self.google_docs_export_types = {
65
+ 'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
66
+ }
67
+
68
+ async def process_documents(
69
+ self,
70
+ vector_store: ChromaVectorStore
71
+ ) -> Dict[str, Any]:
72
+ """
73
+ Process all documents in the specified Drive folder
74
+
75
+ Args:
76
+ vector_store (ChromaVectorStore): Vector store instance
77
+
78
+ Returns:
79
+ Dict[str, Any]: Processing results
80
+ """
81
+ try:
82
+ # Get documents from folder
83
+ files = self.google_drive_service.get_folder_contents(self.folder_id)
84
+
85
+ processed_files = []
86
+ skipped_files = []
87
+ errors = []
88
+
89
+ for file in files:
90
+ result = await self._process_single_file(file, vector_store)
91
+
92
+ if result['status'] == 'processed':
93
+ processed_files.append(result['data'])
94
+ elif result['status'] == 'skipped':
95
+ skipped_files.append(result['data'])
96
+ else: # status == 'error'
97
+ errors.append(result['data'])
98
+
99
+ # Clean up temporary directory if empty
100
+ self._cleanup_temp_dir()
101
+
102
+ return {
103
+ "status": "completed",
104
+ "processed_files": {
105
+ "count": len(processed_files),
106
+ "details": processed_files
107
+ },
108
+ "skipped_files": {
109
+ "count": len(skipped_files),
110
+ "details": skipped_files
111
+ },
112
+ "errors": {
113
+ "count": len(errors),
114
+ "details": errors
115
+ }
116
+ }
117
+
118
+ except Exception as e:
119
+ logger.error(f"Error processing Drive documents: {str(e)}")
120
+ raise HTTPException(
121
+ status_code=500,
122
+ detail=f"Failed to process drive documents: {str(e)}"
123
+ )
124
+
125
+ async def _process_single_file(
126
+ self,
127
+ file: Dict[str, Any],
128
+ vector_store: ChromaVectorStore
129
+ ) -> Dict[str, Any]:
130
+ """Process a single Drive file"""
131
+
132
+ mime_type = file.get('mimeType', '')
133
+
134
+ # Skip if mime type not supported
135
+ if mime_type not in self.supported_mime_types:
136
+ return {
137
+ 'status': 'skipped',
138
+ 'data': {
139
+ 'name': file['name'],
140
+ 'reason': f'Unsupported mime type: {mime_type}'
141
+ }
142
+ }
143
+
144
+ try:
145
+ document_id = file['id']
146
+ modified_time = file.get('modifiedTime', 'N/A') # Get last modified time
147
+
148
+ # Check if document should be processed
149
+ if self.save_document(document_id, vector_store, modified_time):
150
+ # Download and process file
151
+ temp_file_path = await self._download_and_save_file(
152
+ file['id'],
153
+ mime_type
154
+ )
155
+
156
+ try:
157
+ # Process document
158
+ processed_doc = await self.doc_processor.process_document(
159
+ str(temp_file_path)
160
+ )
161
+
162
+ # Add to vector store
163
+ self._add_to_vector_store(
164
+ processed_doc['chunks'],
165
+ file,
166
+ mime_type,
167
+ vector_store
168
+ )
169
+
170
+ return {
171
+ 'status': 'processed',
172
+ 'data': {
173
+ 'name': file['name'],
174
+ 'id': file['id'],
175
+ 'chunks_processed': len(processed_doc['chunks'])
176
+ }
177
+ }
178
+
179
+ finally:
180
+ # Clean up temporary file
181
+ if temp_file_path.exists():
182
+ temp_file_path.unlink()
183
+ else:
184
+ # Return skipped status if document already exists and is up to date
185
+ return {
186
+ 'status': 'skipped',
187
+ 'data': {
188
+ 'name': file['name'],
189
+ 'reason': 'Document already exists in the memory.'
190
+ }
191
+ }
192
+
193
+ except Exception as e:
194
+ logger.error(f"Error processing file {file['name']}: {str(e)}")
195
+ return {
196
+ 'status': 'error',
197
+ 'data': {
198
+ 'file_name': file['name'],
199
+ 'error': str(e)
200
+ }
201
+ }
202
+
203
+ except Exception as e:
204
+ logger.error(f"Error processing file {file['name']}: {str(e)}")
205
+ return {
206
+ 'status': 'error',
207
+ 'data': {
208
+ 'file_name': file['name'],
209
+ 'error': str(e)
210
+ }
211
+ }
212
+
213
+ async def _download_and_save_file(
214
+ self,
215
+ file_id: str,
216
+ mime_type: str
217
+ ) -> Path:
218
+ """Download and save file to temporary location"""
219
+ extension = self.supported_mime_types[mime_type]
220
+ temp_file_path = self.temp_dir / f"{file_id}{extension}"
221
+
222
+ if mime_type in self.google_docs_export_types:
223
+ # Download Google Doc in the specified export format
224
+ content = self.google_drive_service.export_file(
225
+ file_id,
226
+ self.google_docs_export_types[mime_type]
227
+ )
228
+ else:
229
+ # Download regular file
230
+ content = self.google_drive_service.download_file(file_id)
231
+
232
+ with open(temp_file_path, 'wb') as f:
233
+ if isinstance(content, str):
234
+ f.write(content.encode('utf-8'))
235
+ else:
236
+ f.write(content)
237
+
238
+ return temp_file_path
239
+
240
+ def _add_to_vector_store(
241
+ self,
242
+ chunks: List[str],
243
+ file: Dict[str, Any],
244
+ mime_type: str,
245
+ vector_store: ChromaVectorStore
246
+ ) -> None:
247
+ """Add processed chunks to vector store"""
248
+ chunk_metadatas = []
249
+ chunk_ids = []
250
+
251
+ # document_id = file['id']
252
+ modified_time = file.get('modifiedTime', 'N/A') # Get last modified time
253
+ #self.delete_updated_document(document_id, vector_store, modified_time)
254
+
255
+
256
+ for i, chunk in enumerate(chunks):
257
+ chunk_id = f"{file['id']}-chunk-{i}"
258
+ chunk_ids.append(chunk_id)
259
+ chunk_metadatas.append({
260
+ "source": file['name'],
261
+ "document_id": file['id'],
262
+ "chunk_index": i,
263
+ "mime_type": mime_type,
264
+ "modified_time": modified_time,
265
+ "total_chunks": len(chunks),
266
+ "file_type": self.supported_mime_types[mime_type],
267
+ "is_google_doc": mime_type.startswith('application/vnd.google-apps')
268
+ })
269
+
270
+ vector_store.add_documents(
271
+ documents=chunks,
272
+ metadatas=chunk_metadatas,
273
+ ids=chunk_ids
274
+ )
275
+
276
+ def save_document(self, document_id: str, vector_store: ChromaVectorStore, modified_date: str) -> bool:
277
+ """
278
+ Deletes all chunks of a document if the modified_time does not match the given modified_date.
279
+
280
+ Args:
281
+ document_id (str): The ID of the document.
282
+ vector_store (ChromaVectorStore): The Chroma vector store instance.
283
+ modified_date (str): The expected modification date.
284
+ """
285
+ try:
286
+ # Retrieve all chunks for the given document_id
287
+ chunks = vector_store.get_document_chunks(document_id)
288
+
289
+ if not chunks:
290
+ logging.warning(f"No chunks found for document_id: {document_id}. Nothing to delete.")
291
+ return True
292
+
293
+ # Check the modified_time of the first chunk
294
+ first_chunk_metadata = chunks[0].get("metadata", {})
295
+
296
+ if first_chunk_metadata.get("modified_time") != modified_date:
297
+ # If modified_time doesn't match, delete all chunks
298
+ vector_store.delete_document(document_id)
299
+ logging.info(f"Deleted all chunks for document_id: {document_id} due to modified_time mismatch.")
300
+ return True
301
+ else:
302
+ logging.info(f"No deletion needed for document_id: {document_id}, modified_time is unchanged.")
303
+ return False
304
+
305
+
306
+ except Exception as e:
307
+ logging.error(f"Error while deleting chunks for document_id {document_id}: {str(e)}")
308
+ return True
309
+
310
+
311
+
312
+ def _cleanup_temp_dir(self) -> None:
313
+ """Clean up temporary directory if empty"""
314
+ if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
315
+ self.temp_dir.rmdir()
src/utils/google_drive_service.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/utils/google_drive_service.py
2
+ from google.oauth2 import service_account
3
+ from googleapiclient.discovery import build
4
+ from googleapiclient.http import MediaIoBaseDownload
5
+ import io
6
+ import os
7
+
8
class GoogleDriveService:
    """Thin read-only wrapper around the Google Drive v3 API."""

    def __init__(self, credentials_path: str):
        """
        Initialize Google Drive service

        Args:
            credentials_path (str): Path to service account credentials file
        """
        self.credentials = service_account.Credentials.from_service_account_file(
            credentials_path,
            scopes=['https://www.googleapis.com/auth/drive.readonly']
        )
        self.service = build('drive', 'v3', credentials=self.credentials)

    def get_folder_contents(self, folder_id: str):
        """
        List the non-trashed files directly inside a Drive folder.

        Args:
            folder_id (str): ID of the folder to process

        Returns:
            List[Dict]: Metadata (id, name, mimeType, modifiedTime) per file
        """
        query = f"'{folder_id}' in parents and trashed=false"
        listing = self.service.files().list(
            q=query,
            fields="files(id, name, mimeType,modifiedTime)",
            supportsAllDrives=True,
            includeItemsFromAllDrives=True
        ).execute()
        return listing.get('files', [])

    def _fetch_media(self, request) -> bytes:
        """Drain a Drive media request into memory and return its bytes."""
        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, request)

        finished = False
        while not finished:
            _, finished = downloader.next_chunk()

        buffer.seek(0)
        return buffer.read()

    def download_file(self, file_id: str) -> bytes:
        """
        Download a regular file from Drive.

        Args:
            file_id (str): ID of the file to download

        Returns:
            bytes: File content
        """
        return self._fetch_media(self.service.files().get_media(fileId=file_id))

    def export_file(self, file_id: str, mime_type: str) -> bytes:
        """
        Export a Google Workspace file to a different format.

        Args:
            file_id (str): ID of the file to export
            mime_type (str): MIME type to export to

        Returns:
            bytes: Exported file content
        """
        return self._fetch_media(
            self.service.files().export_media(fileId=file_id, mimeType=mime_type)
        )
src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc CHANGED
Binary files a/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc differ
 
testfile.txt DELETED
@@ -1 +0,0 @@
1
- testing123