Spaces:
Running
Running
Commit
·
aee2bfd
1
Parent(s):
b953016
Implementation for Google drive is done. Now it is working fine, except need a new function that can delete the chunks if a document is removed from google drive.
Browse files- client_secret_1048685176388-r728705k0ef26t09rblffue3cqeaaco1.apps.googleusercontent.com.json +1 -0
- config/__pycache__/config.cpython-312.pyc +0 -0
- config/config.py +29 -0
- service_account.json +13 -0
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/agents/__pycache__/excel_aware_rag.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/system_instructions_rag.cpython-312.pyc +0 -0
- src/agents/system_instructions_rag.py +216 -0
- src/db/__pycache__/mongodb_store.cpython-312.pyc +0 -0
- src/main.py +109 -8
- src/models/__pycache__/UserContact.cpython-312.pyc +0 -0
- src/models/__pycache__/chat.cpython-312.pyc +0 -0
- src/models/__pycache__/rag.cpython-312.pyc +0 -0
- src/utils/__pycache__/conversation_manager.cpython-312.pyc +0 -0
- src/utils/__pycache__/database_cleanup.cpython-312.pyc +0 -0
- src/utils/__pycache__/document_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/drive_document_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/google_drive_service.cpython-312.pyc +0 -0
- src/utils/__pycache__/llm_utils.cpython-312.pyc +0 -0
- src/utils/drive_document_processor.py +315 -0
- src/utils/google_drive_service.py +85 -0
- src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc +0 -0
- testfile.txt +0 -1
client_secret_1048685176388-r728705k0ef26t09rblffue3cqeaaco1.apps.googleusercontent.com.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"web":{"client_id":"1048685176388-r728705k0ef26t09rblffue3cqeaaco1.apps.googleusercontent.com","project_id":"demochatbotpropx","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"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","redirect_uris":["http://127.0.0.1:8000/google/oauth2callback"],"javascript_origins":["http://localhost:8000"]}}
|
config/__pycache__/config.cpython-312.pyc
CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
|
|
config/config.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
# config/config.py
|
2 |
import os
|
3 |
from dotenv import load_dotenv
|
|
|
|
|
4 |
|
5 |
# Load environment variables
|
6 |
load_dotenv()
|
@@ -31,7 +33,34 @@ class Settings:
|
|
31 |
# Feedback Configuration
|
32 |
MAX_RATING = int(os.getenv('MAX_RATING', '5'))
|
33 |
|
|
|
|
|
|
|
34 |
# Application Configuration
|
35 |
DEBUG = os.getenv('DEBUG', 'False') == 'True'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
settings = Settings()
|
|
|
1 |
# config/config.py
|
2 |
import os
|
3 |
from dotenv import load_dotenv
|
4 |
+
from google.oauth2.credentials import Credentials
|
5 |
+
from google_auth_oauthlib.flow import Flow
|
6 |
|
7 |
# Load environment variables
|
8 |
load_dotenv()
|
|
|
33 |
# Feedback Configuration
|
34 |
MAX_RATING = int(os.getenv('MAX_RATING', '5'))
|
35 |
|
36 |
+
# Temporary directory for downloaded files
|
37 |
+
TEMP_DOWNLOAD_DIR = os.getenv('TEMP_DOWNLOAD_DIR', './temp_downloads')
|
38 |
+
|
39 |
# Application Configuration
|
40 |
DEBUG = os.getenv('DEBUG', 'False') == 'True'
|
41 |
+
|
42 |
+
# Google Drive Configuration
|
43 |
+
GOOGLE_DRIVE_FOLDER_ID=os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
|
44 |
+
GOOGLE_SERVICE_ACCOUNT_PATH = os.getenv('GOOGLE_SERVICE_ACCOUNT_PATH', 'service_account.json')
|
45 |
+
|
46 |
+
# GOOGLE_DRIVE_FOLDER_ID = os.getenv('GOOGLE_DRIVE_FOLDER_ID', '')
|
47 |
+
# GOOGLE_OAUTH_CLIENT_ID = os.getenv('GOOGLE_OAUTH_CLIENT_ID', '')
|
48 |
+
# GOOGLE_OAUTH_CLIENT_SECRET = os.getenv('GOOGLE_OAUTH_CLIENT_SECRET', '')
|
49 |
+
# GOOGLE_OAUTH_REDIRECT_URI = os.getenv('GOOGLE_OAUTH_REDIRECT_URI', 'http://127.0.0.1:8000/google/oauth2callback')
|
50 |
+
|
51 |
+
# @property
|
52 |
+
# def google_oauth_flow(self):
|
53 |
+
# flow = Flow.from_client_config({
|
54 |
+
# "web": {
|
55 |
+
# "client_id": self.GOOGLE_OAUTH_CLIENT_ID,
|
56 |
+
# "client_secret": self.GOOGLE_OAUTH_CLIENT_SECRET,
|
57 |
+
# "auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
58 |
+
# "token_uri": "https://oauth2.googleapis.com/token",
|
59 |
+
# "redirect_uris": [self.GOOGLE_OAUTH_REDIRECT_URI],
|
60 |
+
# "javascript_origins": ["http://localhost:8000", "http://127.0.0.1:8000"]
|
61 |
+
# }
|
62 |
+
# }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
|
63 |
+
# flow.redirect_uri = self.GOOGLE_OAUTH_REDIRECT_URI
|
64 |
+
# return flow
|
65 |
|
66 |
settings = Settings()
|
service_account.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"type": "service_account",
|
3 |
+
"project_id": "demochatbotpropx",
|
4 |
+
"private_key_id": "7afaa51c4fc75d0d25668e84032d12622408356e",
|
5 |
+
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQC9Or6H3z2BSL24\nfc41WTxUmmhzXUggT8cM0VB/5PTMat8j/vmVhia7mwpMnNEU5f4xcwuF0Lkhjsj2\nCryliph3dPxPUaVCy7aFBatl6kjvf9lBh3bTtIXtONjBN9w2UEqOc85zTpBJxqrT\nNb50VUnZhEDeWAcvB5L3l5+EZxlKUN6HDCdadhLWK4MMuk9XCwQvfqlWR/kT8TJd\nVa408Q4sEoE5F0w5g2epPGBCbCN/FonFq1Mp1Q+kUDmJvdVma4JVBVHSDrqszKum\n/9UPREIEIWar5FPkIVFDwhpKTIWKuwwfdSfbuchVihv1POYksx8NebzxJigP4TWG\nE+QuPpodAgMBAAECggEAC3DMfMkakVrCA0nL5Veehg9XbR4nLBjwRzScG6Q0+4Tv\nz8Y5j18IfKnStUlH4aEUI/Sfx9JHClUOwZCdnjT81qZ8HxmOSc8PAaGs0eEYnsfj\nU6RqiUGUiIA6DqVSpMM1XSgTdwI5em4B/WB2KHEEB+ju+RWbMMgFWTKefAu1sKBY\n1GvNw0QhiizqQN0q2CNSSPqDkPkb24YNTuX0i3XAt3lel3JFbv/SsRgX8NHbO0aU\nPlDleqQF6wWVu5wVmwbw8digXDcsPw/8gb/EqEfgaKLz0/bKd74nyKA/pGZnD+KN\n1pusV+iYymfaejFmSdRPMVq0NfWqRLRw8i7T1xnYmwKBgQD4+cPaeVt1kCqXtCEm\nhRofAa9rPnhkHb44pUffalYgdLi9vT31H1bHh9D8leN4Df1sLQrTttxa7QaVIPfX\nE64yAShyu603nmZ/GUAvtUy54lB76SOjHMWNhbCyEzDcOYSkjH3SedlSYX1xqYNl\nN0GUFeaBDS1PUyyYTSN2rZRYpwKBgQDCkXXgYirMaw2jksij2Ru8ZdT/Dg54YhHQ\n6+/xV+K+MCnt6vw3qloatJzOdXEnulndjl71d8WVYlpBkHzHEaLy3XgA6FGtwq6J\nvKS1w/FZcDi2ra0RFtAi844/HUftbB1ZBSe93nxHNw10XRzFHtFc6OPFcFfOmhgJ\nnYLbH2xLmwKBgBtD9u/RBHQOcqukXVEDmIW2wIglEjgcjb1UVFeiJIZvYd/dfpB+\nexlkxT00CPIXzh3vnNTsnJsUg/kG4D1ceWIegFh4NxL1NNJMaJwQ5bMhlqDLOkzd\nlMDX2C7YLSyg2+bNP+Yx09vSs1MkNjB6aaMW9uRBFiouuJ6BLBYOEkXXAoGBAJp3\n3Tuc9BmCTDu2xu+959U0i1tKj5ZnVXmmNsJGYc9YcZFfY4nWBt740RzgBEvkGIBb\nDWyYABdPFBTFXyq0B8gEp8cgqefnjaXwTFu6ChxVidEOJT5R/EAjWKUm2/nUQaBx\nBVIqFkR7ooTlf3fHtbOreVlAjZWKpNbNZBwO4G1NAoGAU3HX5fyC+0OWMi22cIha\nyUivxH8ti6JmfNJllRr5V3bHHqKrLCPC8tgAZouvutendjm3beNhTeQGoY2QsNhI\nF5NC8euwpYMLhhwVvTB0G6sBxplZZ3FUNMUtpFQ0qvb6VYNQOa2+qBagemhEPhxs\nBfQH0FnqRjBjGoaX7nw6FnU=\n-----END PRIVATE KEY-----\n",
|
6 |
+
"client_email": "[email protected]",
|
7 |
+
"client_id": "108855684700262853537",
|
8 |
+
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
9 |
+
"token_uri": "https://oauth2.googleapis.com/token",
|
10 |
+
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
|
11 |
+
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/demochatbotpropx%40demochatbotpropx.iam.gserviceaccount.com",
|
12 |
+
"universe_domain": "googleapis.com"
|
13 |
+
}
|
src/__pycache__/main.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
|
|
src/agents/__pycache__/excel_aware_rag.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/excel_aware_rag.cpython-312.pyc and b/src/agents/__pycache__/excel_aware_rag.cpython-312.pyc differ
|
|
src/agents/__pycache__/rag_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
|
|
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
ADDED
Binary file (8.58 kB). View file
|
|
src/agents/system_instructions_rag.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# src/agents/system_instructions_rag.py
|
2 |
+
from typing import List, Dict, Optional
|
3 |
+
from src.agents.rag_agent import RAGResponse
|
4 |
+
from src.utils.logger import logger
|
5 |
+
from src.agents.rag_agent import RAGAgent
|
6 |
+
|
7 |
+
class SystemInstructionsRAGAgent(RAGAgent):
|
8 |
+
"""RAG Agent with enhanced system instructions for specific use cases"""
|
9 |
+
|
10 |
+
async def generate_response(
|
11 |
+
self,
|
12 |
+
query: str,
|
13 |
+
conversation_id: Optional[str] = None,
|
14 |
+
temperature: float = 0.7,
|
15 |
+
max_tokens: Optional[int] = None,
|
16 |
+
context_docs: Optional[List[str]] = None
|
17 |
+
) -> RAGResponse:
|
18 |
+
"""
|
19 |
+
Generate response with specific handling for introduction and no-context cases
|
20 |
+
"""
|
21 |
+
try:
|
22 |
+
# First, check if this is an introduction/welcome message query
|
23 |
+
is_introduction = (
|
24 |
+
"wants support" in query and
|
25 |
+
"This is Introduction" in query and
|
26 |
+
("A new user with name:" in query or "An old user with name:" in query)
|
27 |
+
)
|
28 |
+
|
29 |
+
if is_introduction:
|
30 |
+
# Handle introduction message - no context needed
|
31 |
+
welcome_message = self._handle_contact_query(query)
|
32 |
+
return RAGResponse(
|
33 |
+
response=welcome_message,
|
34 |
+
context_docs=[],
|
35 |
+
sources=[],
|
36 |
+
scores=None
|
37 |
+
)
|
38 |
+
|
39 |
+
# For all other queries, proceed with context-based response
|
40 |
+
if not context_docs:
|
41 |
+
context_docs, sources, scores = await self.retrieve_context(
|
42 |
+
query,
|
43 |
+
conversation_history=[]
|
44 |
+
)
|
45 |
+
|
46 |
+
# Check if we have relevant context
|
47 |
+
has_relevant_context = self._check_context_relevance(query, context_docs or [])
|
48 |
+
|
49 |
+
# If no relevant context found, return the standard message
|
50 |
+
if not has_relevant_context:
|
51 |
+
return RAGResponse(
|
52 |
+
response="Information about this is not available, do you want to inquire about something else?",
|
53 |
+
context_docs=[],
|
54 |
+
sources=[],
|
55 |
+
scores=None
|
56 |
+
)
|
57 |
+
|
58 |
+
# Generate response using context
|
59 |
+
prompt = self._create_response_prompt(query, context_docs)
|
60 |
+
response_text = self.llm.generate(
|
61 |
+
prompt,
|
62 |
+
temperature=temperature,
|
63 |
+
max_tokens=max_tokens
|
64 |
+
)
|
65 |
+
|
66 |
+
# Check if the generated response indicates no information
|
67 |
+
cleaned_response = self._clean_response(response_text)
|
68 |
+
if self._is_no_info_response(cleaned_response):
|
69 |
+
return RAGResponse(
|
70 |
+
response="Information about this is not available, do you want to inquire about something else?",
|
71 |
+
context_docs=[],
|
72 |
+
sources=[],
|
73 |
+
scores=None
|
74 |
+
)
|
75 |
+
|
76 |
+
return RAGResponse(
|
77 |
+
response=cleaned_response,
|
78 |
+
context_docs=context_docs,
|
79 |
+
sources=sources,
|
80 |
+
scores=scores
|
81 |
+
)
|
82 |
+
|
83 |
+
except Exception as e:
|
84 |
+
logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
|
85 |
+
raise
|
86 |
+
|
87 |
+
def _is_no_info_response(self, response: str) -> bool:
|
88 |
+
"""Check if the response indicates no information available"""
|
89 |
+
no_info_indicators = [
|
90 |
+
"i do not have",
|
91 |
+
"i don't have",
|
92 |
+
"no information",
|
93 |
+
"not available",
|
94 |
+
"could not find",
|
95 |
+
"couldn't find",
|
96 |
+
"cannot find"
|
97 |
+
]
|
98 |
+
response_lower = response.lower()
|
99 |
+
return any(indicator in response_lower for indicator in no_info_indicators)
|
100 |
+
|
101 |
+
def _check_context_relevance(self, query: str, context_docs: List[str]) -> bool:
|
102 |
+
"""Check if context contains information relevant to the query"""
|
103 |
+
if not context_docs:
|
104 |
+
return False
|
105 |
+
|
106 |
+
# Extract key terms from query
|
107 |
+
query_words = query.lower().split()
|
108 |
+
stop_words = {'share', 'me', 'a', 'about', 'information', 'what', 'is', 'are', 'the', 'in', 'how', 'why', 'when', 'where'}
|
109 |
+
query_terms = {word for word in query_words if word not in stop_words}
|
110 |
+
|
111 |
+
# Check each context document for relevance
|
112 |
+
for doc in context_docs:
|
113 |
+
if not doc:
|
114 |
+
continue
|
115 |
+
doc_lower = doc.lower()
|
116 |
+
if any(term in doc_lower for term in query_terms):
|
117 |
+
# Found relevant content
|
118 |
+
return True
|
119 |
+
return False
|
120 |
+
|
121 |
+
def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
|
122 |
+
"""Create prompt for generating response from context"""
|
123 |
+
formatted_context = '\n\n'.join(
|
124 |
+
f"Context {i+1}:\n{doc.strip()}"
|
125 |
+
for i, doc in enumerate(context_docs)
|
126 |
+
if doc and doc.strip()
|
127 |
+
)
|
128 |
+
|
129 |
+
return f"""
|
130 |
+
Use ONLY the following context to provide information about: {query}
|
131 |
+
|
132 |
+
{formatted_context}
|
133 |
+
|
134 |
+
Instructions:
|
135 |
+
1. Use ONLY information present in the context above
|
136 |
+
2. If the information is found in the context, provide a direct and concise response
|
137 |
+
3. Do not make assumptions or add information not present in the context
|
138 |
+
4. Ensure the response is clear and complete based on available information
|
139 |
+
5. If you cannot find relevant information about the specific query in the context,
|
140 |
+
respond exactly with: "Information about this is not available, do you want to inquire about something else?"
|
141 |
+
|
142 |
+
Query: {query}
|
143 |
+
Response:"""
|
144 |
+
|
145 |
+
def _handle_contact_query(self, query: str) -> str:
|
146 |
+
"""Handle queries from /user/contact endpoint"""
|
147 |
+
try:
|
148 |
+
name_start = query.find('name: "') + 7
|
149 |
+
name_end = query.find('"', name_start)
|
150 |
+
name = query[name_start:name_end] if name_start > 6 and name_end != -1 else "there"
|
151 |
+
|
152 |
+
is_returning = (
|
153 |
+
"An old user with name:" in query and
|
154 |
+
"wants support again" in query
|
155 |
+
)
|
156 |
+
|
157 |
+
if is_returning:
|
158 |
+
return f"Welcome back {name}, How can I help you?"
|
159 |
+
return f"Welcome {name}, How can I help you?"
|
160 |
+
|
161 |
+
except Exception as e:
|
162 |
+
logger.error(f"Error handling contact query: {str(e)}")
|
163 |
+
return "Welcome, How can I help you?"
|
164 |
+
|
165 |
+
def _clean_response(self, response: str) -> str:
|
166 |
+
"""Clean response by removing unwanted phrases"""
|
167 |
+
if not response:
|
168 |
+
return response
|
169 |
+
|
170 |
+
phrases_to_remove = [
|
171 |
+
"Based on the context provided,",
|
172 |
+
"According to the documents,",
|
173 |
+
"From the information available,",
|
174 |
+
"I can tell you that",
|
175 |
+
"Let me help you with that",
|
176 |
+
"I understand you're asking about",
|
177 |
+
"To answer your question,",
|
178 |
+
"The documents indicate that",
|
179 |
+
"Based on the available information,",
|
180 |
+
"As per the provided context,",
|
181 |
+
"I would be happy to help you with that",
|
182 |
+
"Let me provide you with information about",
|
183 |
+
"Here's what I found:",
|
184 |
+
"Here's the information you requested:",
|
185 |
+
"According to the provided information,",
|
186 |
+
"Based on the documents,",
|
187 |
+
"The information suggests that",
|
188 |
+
"From what I can see,",
|
189 |
+
"Let me explain",
|
190 |
+
"To clarify,",
|
191 |
+
"It appears that",
|
192 |
+
"I can see that",
|
193 |
+
"Sure,",
|
194 |
+
"Well,",
|
195 |
+
"Based on the given context,",
|
196 |
+
"The available information shows that",
|
197 |
+
"From the context provided,",
|
198 |
+
"The documentation mentions that",
|
199 |
+
"According to the context,",
|
200 |
+
"As shown in the context,",
|
201 |
+
"I apologize,"
|
202 |
+
]
|
203 |
+
|
204 |
+
cleaned_response = response
|
205 |
+
for phrase in phrases_to_remove:
|
206 |
+
cleaned_response = cleaned_response.replace(phrase, "").strip()
|
207 |
+
|
208 |
+
cleaned_response = " ".join(cleaned_response.split())
|
209 |
+
|
210 |
+
if not cleaned_response:
|
211 |
+
return response
|
212 |
+
|
213 |
+
if cleaned_response[0].islower():
|
214 |
+
cleaned_response = cleaned_response[0].upper() + cleaned_response[1:]
|
215 |
+
|
216 |
+
return cleaned_response
|
src/db/__pycache__/mongodb_store.cpython-312.pyc
CHANGED
Binary files a/src/db/__pycache__/mongodb_store.cpython-312.pyc and b/src/db/__pycache__/mongodb_store.cpython-312.pyc differ
|
|
src/main.py
CHANGED
@@ -8,12 +8,23 @@ import uuid
|
|
8 |
from datetime import datetime
|
9 |
from pathlib import Path
|
10 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# Import custom modules1
|
13 |
-
from src.agents.rag_agent import RAGAgent
|
|
|
14 |
from src.models.document import AllDocumentsResponse, StoredDocument
|
15 |
from src.models.UserContact import UserContactRequest
|
16 |
from src.utils.document_processor import DocumentProcessor
|
|
|
17 |
from src.utils.conversation_summarizer import ConversationSummarizer
|
18 |
from src.utils.logger import logger
|
19 |
from src.utils.llm_utils import get_llm_instance, get_vector_store
|
@@ -43,6 +54,8 @@ app.add_middleware(
|
|
43 |
allow_headers=["*"], # Allows all headers
|
44 |
)
|
45 |
|
|
|
|
|
46 |
# Initialize MongoDB
|
47 |
mongodb = MongoDBStore(settings.MONGODB_URI)
|
48 |
|
@@ -70,6 +83,43 @@ async def verify_api_key(api_key: str = Depends(API_KEY_HEADER)):
|
|
70 |
)
|
71 |
return api_key
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
@app.get("/documents")
|
74 |
async def get_all_documents():
|
75 |
"""Get all documents from MongoDB"""
|
@@ -147,7 +197,8 @@ async def upload_documents(
|
|
147 |
logger.error(f"Error in document upload: {str(e)}")
|
148 |
raise HTTPException(status_code=500, detail=str(e))
|
149 |
|
150 |
-
|
|
|
151 |
async def get_document_chunks(document_id: str):
|
152 |
"""Get all chunks for a specific document"""
|
153 |
try:
|
@@ -207,8 +258,31 @@ async def delete_document(document_id: str):
|
|
207 |
logger.error(f"Error in delete_document endpoint: {str(e)}")
|
208 |
raise HTTPException(status_code=500, detail=str(e))
|
209 |
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
@app.post("/user/contact", response_model=ChatResponse)
|
213 |
async def create_user_contact(
|
214 |
request: UserContactRequest,
|
@@ -224,7 +298,7 @@ async def create_user_contact(
|
|
224 |
|
225 |
if existing_conversation_id:
|
226 |
chat_request = ChatRequest(
|
227 |
-
query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. Create a welcome back message for him and ask how i can help you today?',
|
228 |
llm_provider="openai",
|
229 |
max_context_docs=3,
|
230 |
temperature=1.0,
|
@@ -242,7 +316,7 @@ async def create_user_contact(
|
|
242 |
)
|
243 |
|
244 |
chat_request = ChatRequest(
|
245 |
-
query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. Create a welcome message for him and ask how i can help you today?',
|
246 |
llm_provider="openai",
|
247 |
max_context_docs=3,
|
248 |
temperature=1.0,
|
@@ -272,13 +346,40 @@ async def chat_endpoint(
|
|
272 |
llm = get_llm_instance(request.llm_provider)
|
273 |
|
274 |
# Initialize RAG agent
|
275 |
-
rag_agent = RAGAgent(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
llm=llm,
|
277 |
embedding=embedding_model,
|
278 |
vector_store=vector_store,
|
279 |
mongodb=mongodb
|
280 |
)
|
281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
282 |
# Use provided conversation ID or create new one
|
283 |
conversation_id = request.conversation_id or str(uuid.uuid4())
|
284 |
|
@@ -287,7 +388,7 @@ async def chat_endpoint(
|
|
287 |
|
288 |
# Add specific instructions for certain types of queries
|
289 |
#if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
|
290 |
-
query += ". The response should be short and to the point. Make sure to not add any irrelevant information. Keep the introduction concise and friendly."
|
291 |
|
292 |
# Generate response
|
293 |
logger.info(f"Generating response: {str(datetime.now())}")
|
|
|
8 |
from datetime import datetime
|
9 |
from pathlib import Path
|
10 |
import os
|
11 |
+
import asyncio
|
12 |
+
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
|
13 |
+
#os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1"
|
14 |
+
|
15 |
+
|
16 |
+
from fastapi.responses import RedirectResponse
|
17 |
+
from google.oauth2.credentials import Credentials
|
18 |
+
from google_auth_oauthlib.flow import Flow
|
19 |
+
from src.utils.google_drive_service import GoogleDriveService
|
20 |
|
21 |
# Import custom modules1
|
22 |
+
#from src.agents.rag_agent import RAGAgent
|
23 |
+
from src.agents.system_instructions_rag import SystemInstructionsRAGAgent
|
24 |
from src.models.document import AllDocumentsResponse, StoredDocument
|
25 |
from src.models.UserContact import UserContactRequest
|
26 |
from src.utils.document_processor import DocumentProcessor
|
27 |
+
from src.utils.drive_document_processor import DriveDocumentProcessor
|
28 |
from src.utils.conversation_summarizer import ConversationSummarizer
|
29 |
from src.utils.logger import logger
|
30 |
from src.utils.llm_utils import get_llm_instance, get_vector_store
|
|
|
54 |
allow_headers=["*"], # Allows all headers
|
55 |
)
|
56 |
|
57 |
+
#google_drive_service = GoogleDriveService()
|
58 |
+
|
59 |
# Initialize MongoDB
|
60 |
mongodb = MongoDBStore(settings.MONGODB_URI)
|
61 |
|
|
|
83 |
)
|
84 |
return api_key
|
85 |
|
86 |
+
# @app.get("/google/auth")
|
87 |
+
# async def google_auth():
|
88 |
+
# authorization_url, _ = settings.google_oauth_flow.authorization_url(
|
89 |
+
# access_type='offline',
|
90 |
+
# prompt='consent',
|
91 |
+
# include_granted_scopes='true'
|
92 |
+
# )
|
93 |
+
# return RedirectResponse(authorization_url)
|
94 |
+
|
95 |
+
# @app.get("/google/oauth2callback")
|
96 |
+
# async def google_auth_callback(code: str):
|
97 |
+
# flow = Flow.from_client_config({
|
98 |
+
# "web": {
|
99 |
+
# "client_id": settings.GOOGLE_OAUTH_CLIENT_ID,
|
100 |
+
# "client_secret": settings.GOOGLE_OAUTH_CLIENT_SECRET,
|
101 |
+
# "auth_uri": "https://accounts.google.com/o/oauth2/auth",
|
102 |
+
# "token_uri": "https://oauth2.googleapis.com/token",
|
103 |
+
# "redirect_uris": [settings.GOOGLE_OAUTH_REDIRECT_URI]
|
104 |
+
# }
|
105 |
+
# }, scopes=['https://www.googleapis.com/auth/drive.readonly'])
|
106 |
+
|
107 |
+
# flow.redirect_uri = settings.GOOGLE_OAUTH_REDIRECT_URI
|
108 |
+
|
109 |
+
# # Add access type and prompt parameters for refresh token
|
110 |
+
# flow.fetch_token(
|
111 |
+
# code=code,
|
112 |
+
# access_type='offline',
|
113 |
+
# prompt='consent'
|
114 |
+
# )
|
115 |
+
# credentials = flow.credentials
|
116 |
+
|
117 |
+
# return {
|
118 |
+
# "message": "Authentication successful",
|
119 |
+
# "credentials": credentials.to_json()
|
120 |
+
# }
|
121 |
+
|
122 |
+
|
123 |
@app.get("/documents")
|
124 |
async def get_all_documents():
|
125 |
"""Get all documents from MongoDB"""
|
|
|
197 |
logger.error(f"Error in document upload: {str(e)}")
|
198 |
raise HTTPException(status_code=500, detail=str(e))
|
199 |
|
200 |
+
|
201 |
+
@app.get("/documentChunks/{document_id}")
|
202 |
async def get_document_chunks(document_id: str):
|
203 |
"""Get all chunks for a specific document"""
|
204 |
try:
|
|
|
258 |
logger.error(f"Error in delete_document endpoint: {str(e)}")
|
259 |
raise HTTPException(status_code=500, detail=str(e))
|
260 |
|
261 |
+
@app.post("/processDriveDocuments")
|
262 |
+
async def process_drive_documents():
|
263 |
+
try:
|
264 |
+
# Initialize vector store
|
265 |
+
vector_store, _ = await get_vector_store()
|
266 |
+
|
267 |
+
# Initialize Drive document processor
|
268 |
+
drive_processor = DriveDocumentProcessor(
|
269 |
+
google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
|
270 |
+
folder_id=settings.GOOGLE_DRIVE_FOLDER_ID,
|
271 |
+
temp_dir=settings.TEMP_DOWNLOAD_DIR,
|
272 |
+
doc_processor=doc_processor
|
273 |
+
)
|
274 |
+
|
275 |
+
# Process documents
|
276 |
+
result = await drive_processor.process_documents(vector_store)
|
277 |
+
return result
|
278 |
+
|
279 |
+
except Exception as e:
|
280 |
+
logger.error(f"Error in process_drive_documents: {str(e)}")
|
281 |
+
raise HTTPException(
|
282 |
+
status_code=500,
|
283 |
+
detail=str(e)
|
284 |
+
)
|
285 |
+
|
286 |
@app.post("/user/contact", response_model=ChatResponse)
|
287 |
async def create_user_contact(
|
288 |
request: UserContactRequest,
|
|
|
298 |
|
299 |
if existing_conversation_id:
|
300 |
chat_request = ChatRequest(
|
301 |
+
query=f'An old user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support again. This is Introduction Create a welcome back message for him and ask how i can help you today?',
|
302 |
llm_provider="openai",
|
303 |
max_context_docs=3,
|
304 |
temperature=1.0,
|
|
|
316 |
)
|
317 |
|
318 |
chat_request = ChatRequest(
|
319 |
+
query=f'A new user with name: "{request.full_name}", email: "{request.email}" and phone number: "{request.phone_number}" wants support. This is Introduction Create a welcome message for him and ask how i can help you today?',
|
320 |
llm_provider="openai",
|
321 |
max_context_docs=3,
|
322 |
temperature=1.0,
|
|
|
346 |
llm = get_llm_instance(request.llm_provider)
|
347 |
|
348 |
# Initialize RAG agent
|
349 |
+
# rag_agent = RAGAgent(
|
350 |
+
# llm=llm,
|
351 |
+
# embedding=embedding_model,
|
352 |
+
# vector_store=vector_store,
|
353 |
+
# mongodb=mongodb
|
354 |
+
# )
|
355 |
+
|
356 |
+
rag_agent = SystemInstructionsRAGAgent(
|
357 |
llm=llm,
|
358 |
embedding=embedding_model,
|
359 |
vector_store=vector_store,
|
360 |
mongodb=mongodb
|
361 |
)
|
362 |
|
363 |
+
# rag_agent.add_custom_role(
|
364 |
+
# "Knowledge based chatbot and introduction specialist",
|
365 |
+
# """You are a welcome agent with knowledge based specialist focusing on knowledge attached and create a beautiful welcome message.
|
366 |
+
# Your role is to:
|
367 |
+
# 1. Your response should be short and to the point.
|
368 |
+
# 2. Strictly follow this point for If it is an introduction. You strictly respond that "Welcome name of customer to our platform. How can I help you today?"
|
369 |
+
# """
|
370 |
+
# )
|
371 |
+
|
372 |
+
# rag_agent.add_custom_role(
|
373 |
+
# "Knowledge based chatbot",
|
374 |
+
# """You are a knowledge based specialist focusing on knowledge attached.
|
375 |
+
# Your role is to:
|
376 |
+
# 1. Your response should be short and to the point.
|
377 |
+
# 2. if it is not introduction then make sure to share the response from Vector store.
|
378 |
+
# 3. If you do not find relevant information. Just say I do not have this information but this do not apply to introduction message.
|
379 |
+
# 4. If there is an introduction, you should ignore above roles and connect with LLm to have a welcome message for the user.
|
380 |
+
# """
|
381 |
+
# )
|
382 |
+
|
383 |
# Use provided conversation ID or create new one
|
384 |
conversation_id = request.conversation_id or str(uuid.uuid4())
|
385 |
|
|
|
388 |
|
389 |
# Add specific instructions for certain types of queries
|
390 |
#if "introduce" in query.lower() or "name" in query.lower() or "email" in query.lower():
|
391 |
+
#query += ". The response should be short and to the point. Make sure to not add any irrelevant information. make sure to share the response from Vector store, if you do not find information in vector store. Just respond I do not have information. Keep the introduction concise and friendly."
|
392 |
|
393 |
# Generate response
|
394 |
logger.info(f"Generating response: {str(datetime.now())}")
|
src/models/__pycache__/UserContact.cpython-312.pyc
CHANGED
Binary files a/src/models/__pycache__/UserContact.cpython-312.pyc and b/src/models/__pycache__/UserContact.cpython-312.pyc differ
|
|
src/models/__pycache__/chat.cpython-312.pyc
CHANGED
Binary files a/src/models/__pycache__/chat.cpython-312.pyc and b/src/models/__pycache__/chat.cpython-312.pyc differ
|
|
src/models/__pycache__/rag.cpython-312.pyc
CHANGED
Binary files a/src/models/__pycache__/rag.cpython-312.pyc and b/src/models/__pycache__/rag.cpython-312.pyc differ
|
|
src/utils/__pycache__/conversation_manager.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/conversation_manager.cpython-312.pyc and b/src/utils/__pycache__/conversation_manager.cpython-312.pyc differ
|
|
src/utils/__pycache__/database_cleanup.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/database_cleanup.cpython-312.pyc and b/src/utils/__pycache__/database_cleanup.cpython-312.pyc differ
|
|
src/utils/__pycache__/document_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
|
|
src/utils/__pycache__/drive_document_processor.cpython-312.pyc
ADDED
Binary file (10.8 kB). View file
|
|
src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc and b/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc differ
|
|
src/utils/__pycache__/google_drive_service.cpython-312.pyc
ADDED
Binary file (3.94 kB). View file
|
|
src/utils/__pycache__/llm_utils.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/llm_utils.cpython-312.pyc and b/src/utils/__pycache__/llm_utils.cpython-312.pyc differ
|
|
src/utils/drive_document_processor.py
ADDED
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# src/utils/drive_document_processor.py
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import Dict, List, Any, Tuple
|
4 |
+
import logging
|
5 |
+
from fastapi import HTTPException
|
6 |
+
|
7 |
+
from src.utils.google_drive_service import GoogleDriveService
|
8 |
+
from src.utils.document_processor import DocumentProcessor
|
9 |
+
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
|
10 |
+
from src.utils.logger import logger
|
11 |
+
|
12 |
+
class DriveDocumentProcessor:
    """Sync documents from a Google Drive folder into a Chroma vector store.

    For every supported file in the configured folder the processor downloads
    the content to a temporary location, chunks it via ``DocumentProcessor``,
    and stores the chunks (with per-chunk metadata) in the vector store.
    A document is re-ingested only when its Drive ``modifiedTime`` differs
    from the value recorded in the stored chunk metadata.

    NOTE(review): documents *deleted* from Drive are not yet removed from the
    vector store — a reconciliation pass over stored document_ids is still
    needed (see commit message).
    """

    def __init__(
        self,
        google_service_account_path: str,
        folder_id: str,
        temp_dir: str,
        doc_processor: DocumentProcessor
    ):
        """
        Initialize Drive Document Processor

        Args:
            google_service_account_path (str): Path to Google service account credentials
            folder_id (str): Google Drive folder ID to process
            temp_dir (str): Directory for temporary files
            doc_processor (DocumentProcessor): Instance of DocumentProcessor
        """
        self.google_drive_service = GoogleDriveService(google_service_account_path)
        self.folder_id = folder_id
        self.temp_dir = Path(temp_dir)
        self.doc_processor = doc_processor

        # Create temp directory if it doesn't exist
        self.temp_dir.mkdir(exist_ok=True)

        # Map of ingestible MIME types -> local file extension used when saving.
        self.supported_mime_types = {
            # Google Docs
            'application/vnd.google-apps.document': '.docx',  # Export Google Docs as DOCX

            # Microsoft Word Documents
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',

            # Microsoft Excel Documents
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',

            # Text Documents
            'text/plain': '.txt',
            'text/csv': '.csv',
            'text/markdown': '.md',
            'text/html': '.html',
            'text/xml': '.xml',
            'application/json': '.json',
            'application/rtf': '.rtf',

            # PDF Documents
            'application/pdf': '.pdf'
        }

        # Google-native formats cannot be downloaded directly; they must be
        # exported to the Office MIME type given here.
        self.google_docs_export_types = {
            'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        }

    async def process_documents(
        self,
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """
        Process all documents in the specified Drive folder

        Args:
            vector_store (ChromaVectorStore): Vector store instance

        Returns:
            Dict[str, Any]: Summary with ``processed_files``, ``skipped_files``
            and ``errors`` sections, each holding a count and per-file details.

        Raises:
            HTTPException: 500 if listing or processing the folder fails
            as a whole (per-file failures are reported in ``errors`` instead).
        """
        try:
            # Get documents from folder
            files = self.google_drive_service.get_folder_contents(self.folder_id)

            processed_files = []
            skipped_files = []
            errors = []

            # Each file is handled independently so one bad document does not
            # abort the whole sync run.
            for file in files:
                result = await self._process_single_file(file, vector_store)

                if result['status'] == 'processed':
                    processed_files.append(result['data'])
                elif result['status'] == 'skipped':
                    skipped_files.append(result['data'])
                else:  # status == 'error'
                    errors.append(result['data'])

            # Clean up temporary directory if empty
            self._cleanup_temp_dir()

            return {
                "status": "completed",
                "processed_files": {
                    "count": len(processed_files),
                    "details": processed_files
                },
                "skipped_files": {
                    "count": len(skipped_files),
                    "details": skipped_files
                },
                "errors": {
                    "count": len(errors),
                    "details": errors
                }
            }

        except Exception as e:
            logger.error(f"Error processing Drive documents: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to process drive documents: {str(e)}"
            )

    async def _process_single_file(
        self,
        file: Dict[str, Any],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """Process a single Drive file.

        Returns a dict with ``status`` in {'processed', 'skipped', 'error'}
        and a ``data`` payload describing the outcome. Never raises; all
        failures are converted into an 'error' result.
        """
        mime_type = file.get('mimeType', '')

        # Skip if mime type not supported
        if mime_type not in self.supported_mime_types:
            return {
                'status': 'skipped',
                'data': {
                    'name': file['name'],
                    'reason': f'Unsupported mime type: {mime_type}'
                }
            }

        try:
            document_id = file['id']
            modified_time = file.get('modifiedTime', 'N/A')  # Get last modified time

            # save_document() deletes stale chunks and tells us whether the
            # document needs (re-)ingestion.
            if not self.save_document(document_id, vector_store, modified_time):
                # Document already present with an unchanged modified_time.
                return {
                    'status': 'skipped',
                    'data': {
                        'name': file['name'],
                        'reason': 'Document already exists in the memory.'
                    }
                }

            # Download and process file
            temp_file_path = await self._download_and_save_file(
                file['id'],
                mime_type
            )

            try:
                # Process document
                processed_doc = await self.doc_processor.process_document(
                    str(temp_file_path)
                )

                # Add to vector store
                self._add_to_vector_store(
                    processed_doc['chunks'],
                    file,
                    mime_type,
                    vector_store
                )

                return {
                    'status': 'processed',
                    'data': {
                        'name': file['name'],
                        'id': file['id'],
                        'chunks_processed': len(processed_doc['chunks'])
                    }
                }

            finally:
                # Clean up temporary file even when processing fails.
                if temp_file_path.exists():
                    temp_file_path.unlink()

        # Single handler: the original had two identical `except Exception`
        # clauses on this try, the second of which was unreachable.
        except Exception as e:
            logger.error(f"Error processing file {file['name']}: {str(e)}")
            return {
                'status': 'error',
                'data': {
                    'file_name': file['name'],
                    'error': str(e)
                }
            }

    async def _download_and_save_file(
        self,
        file_id: str,
        mime_type: str
    ) -> Path:
        """Download a Drive file (exporting Google-native docs) to a temp path.

        Returns:
            Path: Location of the saved temporary file, named ``<file_id><ext>``.
        """
        extension = self.supported_mime_types[mime_type]
        temp_file_path = self.temp_dir / f"{file_id}{extension}"

        if mime_type in self.google_docs_export_types:
            # Download Google Doc in the specified export format
            content = self.google_drive_service.export_file(
                file_id,
                self.google_docs_export_types[mime_type]
            )
        else:
            # Download regular file
            content = self.google_drive_service.download_file(file_id)

        with open(temp_file_path, 'wb') as f:
            # Some handlers may return text; normalize to bytes.
            if isinstance(content, str):
                f.write(content.encode('utf-8'))
            else:
                f.write(content)

        return temp_file_path

    def _add_to_vector_store(
        self,
        chunks: List[str],
        file: Dict[str, Any],
        mime_type: str,
        vector_store: ChromaVectorStore
    ) -> None:
        """Add processed chunks to the vector store.

        Chunk ids follow the pattern ``<drive-file-id>-chunk-<i>`` so that all
        chunks of a document can later be located/deleted by document id.
        """
        chunk_metadatas = []
        chunk_ids = []

        modified_time = file.get('modifiedTime', 'N/A')  # Get last modified time

        for i, chunk in enumerate(chunks):
            chunk_id = f"{file['id']}-chunk-{i}"
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                "source": file['name'],
                "document_id": file['id'],
                "chunk_index": i,
                "mime_type": mime_type,
                "modified_time": modified_time,
                "total_chunks": len(chunks),
                "file_type": self.supported_mime_types[mime_type],
                "is_google_doc": mime_type.startswith('application/vnd.google-apps')
            })

        vector_store.add_documents(
            documents=chunks,
            metadatas=chunk_metadatas,
            ids=chunk_ids
        )

    def save_document(self, document_id: str, vector_store: ChromaVectorStore, modified_date: str) -> bool:
        """
        Decide whether a document must be (re-)ingested, deleting stale chunks.

        If stored chunks exist but their ``modified_time`` differs from
        ``modified_date``, all chunks for the document are deleted so the
        caller can re-ingest the fresh copy.

        Args:
            document_id (str): The ID of the document.
            vector_store (ChromaVectorStore): The Chroma vector store instance.
            modified_date (str): The document's current Drive modifiedTime.

        Returns:
            bool: True if the caller should (re-)process the document,
            False if stored chunks are already up to date.
        """
        try:
            # Retrieve all chunks for the given document_id
            chunks = vector_store.get_document_chunks(document_id)

            if not chunks:
                logger.warning(f"No chunks found for document_id: {document_id}. Nothing to delete.")
                return True

            # Check the modified_time of the first chunk; all chunks of a
            # document are written with the same modified_time.
            first_chunk_metadata = chunks[0].get("metadata", {})

            if first_chunk_metadata.get("modified_time") != modified_date:
                # If modified_time doesn't match, delete all chunks
                vector_store.delete_document(document_id)
                logger.info(f"Deleted all chunks for document_id: {document_id} due to modified_time mismatch.")
                return True
            else:
                logger.info(f"No deletion needed for document_id: {document_id}, modified_time is unchanged.")
                return False

        except Exception as e:
            logger.error(f"Error while deleting chunks for document_id {document_id}: {str(e)}")
            # Fail open: on lookup errors we re-process rather than risk
            # serving stale content.
            return True

    def _cleanup_temp_dir(self) -> None:
        """Clean up temporary directory if empty"""
        if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
            self.temp_dir.rmdir()
|
src/utils/google_drive_service.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# src/utils/google_drive_service.py
|
2 |
+
from google.oauth2 import service_account
|
3 |
+
from googleapiclient.discovery import build
|
4 |
+
from googleapiclient.http import MediaIoBaseDownload
|
5 |
+
import io
|
6 |
+
import os
|
7 |
+
|
8 |
+
class GoogleDriveService:
    """Thin read-only wrapper around the Google Drive v3 API."""

    def __init__(self, credentials_path: str):
        """
        Initialize Google Drive service

        Args:
            credentials_path (str): Path to service account credentials file
        """
        # Read-only scope: this service never mutates Drive content.
        self.credentials = service_account.Credentials.from_service_account_file(
            credentials_path,
            scopes=['https://www.googleapis.com/auth/drive.readonly']
        )
        self.service = build('drive', 'v3', credentials=self.credentials)

    def get_folder_contents(self, folder_id: str):
        """
        Get contents of a Drive folder

        Args:
            folder_id (str): ID of the folder to process

        Returns:
            List[Dict]: File metadata (id, name, mimeType, modifiedTime),
            excluding trashed items. Shared-drive items are included.
        """
        query = f"'{folder_id}' in parents and trashed=false"
        results = self.service.files().list(
            q=query,
            fields="files(id, name, mimeType,modifiedTime)",
            supportsAllDrives=True,
            includeItemsFromAllDrives=True
        ).execute()
        return results.get('files', [])

    def _download(self, request) -> bytes:
        """Drain a Drive media request into memory and return the raw bytes.

        Shared by download_file() and export_file(), which previously
        duplicated this chunked-download loop.
        """
        content = io.BytesIO()
        downloader = MediaIoBaseDownload(content, request)

        done = False
        while not done:
            _, done = downloader.next_chunk()

        content.seek(0)
        return content.read()

    def download_file(self, file_id: str) -> bytes:
        """
        Download a file from Drive

        Args:
            file_id (str): ID of the file to download

        Returns:
            bytes: File content
        """
        return self._download(self.service.files().get_media(fileId=file_id))

    def export_file(self, file_id: str, mime_type: str) -> bytes:
        """
        Export a Google Workspace file to a different format

        Args:
            file_id (str): ID of the file to export
            mime_type (str): MIME type to export to

        Returns:
            bytes: Exported file content
        """
        request = self.service.files().export_media(
            fileId=file_id,
            mimeType=mime_type
        )
        return self._download(request)
|
src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc
CHANGED
Binary files a/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/optimized_vectorstore.cpython-312.pyc differ
|
|
testfile.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
testing123
|
|
|
|