Commit · acdfaa9
Parent(s): 1a54bda
Log Google Drive documents in MongoDB, add the source of each document, and make chunks overlap text.
- Install your driver.txt +19 -0
- config/__pycache__/config.cpython-312.pyc +0 -0
- config/config.py +2 -0
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/system_instructions_rag.cpython-312.pyc +0 -0
- src/agents/rag_agent.py +66 -32
- src/db/__pycache__/mongodb_store.cpython-312.pyc +0 -0
- src/db/mongodb_store.py +38 -33
- src/implementations/__pycache__/document_service.cpython-312.pyc +0 -0
- src/implementations/document_service.py +41 -33
- src/main.py +4 -2
- src/utils/__pycache__/document_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/drive_document_processor.cpython-312.pyc +0 -0
- src/utils/document_processor.py +26 -140
- src/utils/drive_document_processor.py +125 -46
- src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc +0 -0
- src/vectorstores/chroma_vectorstore.py +59 -46
Install your driver.txt
ADDED
@@ -0,0 +1,19 @@
+2. Install your driver
+Run the following on the command line
+Note: Use appropriate Python 3 executable
+python -m pip install "pymongo[srv]"==3.12
+
+
+View MongoDB Python Driver installation instructions.
+3. Add your connection string into your application code
+Use this connection string in your application
+
+
+View full code sample
+
+
+Show Password
+
+mongodb+srv://talat:[email protected]/?retryWrites=true&w=majority&appName=Chatbot
+
+The password for talat is included in the connection string for your first time setup. This password will not be available again after exiting this connect flow.
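For reference, a minimal sketch of using the driver installed above from application code; the URI below is a placeholder (substitute the Atlas connection string from the connect flow), and the ping check is only an illustration, not part of the committed file.

    from pymongo import MongoClient

    # Placeholder URI: substitute the real Atlas connection string shown above.
    MONGO_URI = "mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority&appName=Chatbot"

    client = MongoClient(MONGO_URI)
    client.admin.command("ping")  # Raises if the URI or credentials are invalid.
    print(client.list_database_names())
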
config/__pycache__/config.cpython-312.pyc
CHANGED
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
config/config.py
CHANGED
@@ -22,6 +22,8 @@ class Settings:
     # Anthropic Configuration
     ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')
 
+    # top number of chunks to retrieve.
+    TOP_CHUNKS = int(os.getenv('TOP_CHUNKS', '10'))
     # Environment Configuration
     ENVIRONMENT = os.getenv('ENVIRONMENT').lower()
 
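As a quick illustration of the new setting, the snippet below reads TOP_CHUNKS the same way Settings does; the override value of 5 is invented for the example.

    import os

    os.environ["TOP_CHUNKS"] = "5"  # e.g. exported in the deployment environment
    TOP_CHUNKS = int(os.getenv('TOP_CHUNKS', '10'))  # same expression as in Settings
    print(TOP_CHUNKS)  # 5; without the override it falls back to 10
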
src/__pycache__/main.cpython-312.pyc
CHANGED
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
src/agents/__pycache__/rag_agent.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
CHANGED
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
src/agents/rag_agent.py
CHANGED
@@ -10,6 +10,7 @@ from src.utils.conversation_manager import ConversationManager
 from src.db.mongodb_store import MongoDBStore
 from src.models.rag import RAGResponse
 from src.utils.logger import logger
+from config.config import settings
 
 
 class RAGAgent(ExcelAwareRAGAgent):
@@ -43,6 +44,50 @@ class RAGAgent(ExcelAwareRAGAgent):
             max_messages=max_history_messages
         )
 
+    def _extract_markdown_section(self, docs: List[str], section_header: str) -> str:
+        """Extract complete section content from markdown documents"""
+        combined_text = '\n'.join(docs)
+
+        section_start = combined_text.find(section_header)
+        if section_start == -1:
+            return ""
+
+        next_section = combined_text.find(
+            "\n\n**", section_start + len(section_header))
+        if next_section == -1:
+            section_content = combined_text[section_start:]
+        else:
+            section_content = combined_text[section_start:next_section]
+
+        return self._clean_markdown_content(section_content)
+
+    def _clean_markdown_content(self, content: str) -> str:
+        """Clean and format markdown content"""
+        lines = content.split('\n')
+        seen_lines = set()
+        cleaned_lines = []
+
+        for line in lines:
+            # Always keep headers and table formatting
+            if '| :----' in line or line.startswith('**'):
+                if line not in seen_lines:
+                    cleaned_lines.append(line)
+                    seen_lines.add(line)
+                continue
+
+            # Keep table rows and list items
+            if line.strip().startswith('|') or line.strip().startswith('-'):
+                cleaned_lines.append(line)
+                continue
+
+            # Remove duplicates for other content
+            stripped = line.strip()
+            if stripped and stripped not in seen_lines:
+                cleaned_lines.append(line)
+                seen_lines.add(stripped)
+
+        return '\n'.join(cleaned_lines)
+
     async def generate_response(
         self,
         query: str,
@@ -51,9 +96,9 @@ class RAGAgent(ExcelAwareRAGAgent):
         max_tokens: Optional[int] = None,
         context_docs: Optional[List[str]] = None
     ) -> RAGResponse:
-        """Generate response with …
+        """Generate response with improved markdown and conversation handling"""
         try:
-            # …
+            # Handle introduction/welcome message queries
             is_introduction = (
                 "wants support" in query and
                 "This is Introduction" in query and
@@ -61,7 +106,6 @@ class RAGAgent(ExcelAwareRAGAgent):
             )
 
             if is_introduction:
-                # Handle introduction message - no context needed
                 welcome_message = self._handle_contact_query(query)
                 return RAGResponse(
                     response=welcome_message,
@@ -77,8 +121,6 @@ class RAGAgent(ExcelAwareRAGAgent):
                 conversation_id,
                 limit=self.conversation_manager.max_messages
             )
-
-            # Get relevant history within token limits
             history = self.conversation_manager.get_relevant_history(
                 messages=history,
                 current_query=query
@@ -94,6 +136,21 @@ class RAGAgent(ExcelAwareRAGAgent):
             sources = None
             scores = None
 
+            # Special handling for markdown section queries
+            if "DISCUSSIONS AND ACTION ITEMS" in query.upper():
+                section_content = self._extract_markdown_section(
+                    context_docs,
+                    "**DISCUSSIONS AND ACTION ITEMS**"
+                )
+
+                if section_content:
+                    return RAGResponse(
+                        response=section_content.strip(),
+                        context_docs=context_docs,
+                        sources=sources,
+                        scores=scores
+                    )
+
             # Check if we have any relevant context
             if not context_docs:
                 return RAGResponse(
@@ -103,15 +160,6 @@ class RAGAgent(ExcelAwareRAGAgent):
                     scores=None
                 )
 
-            # Check if this is an Excel-related query
-            has_excel_content = any('Sheet:' in doc for doc in context_docs)
-            if has_excel_content:
-                try:
-                    context_docs = self._process_excel_context(
-                        context_docs, query)
-                except Exception as e:
-                    logger.warning(f"Error processing Excel context: {str(e)}")
-
             # Generate prompt with context and history
             augmented_prompt = self.conversation_manager.generate_prompt_with_history(
                 current_query=query,
@@ -119,7 +167,7 @@ class RAGAgent(ExcelAwareRAGAgent):
                 context_docs=context_docs
             )
 
-            # Generate …
+            # Generate response
             response = self.llm.generate(
                 prompt=augmented_prompt,
                 temperature=temperature,
@@ -129,19 +177,6 @@ class RAGAgent(ExcelAwareRAGAgent):
             # Clean the response
             cleaned_response = self._clean_response(response)
 
-            # For Excel queries, enhance the response
-            if has_excel_content:
-                try:
-                    enhanced_response = await self.enhance_excel_response(
-                        query=query,
-                        response=cleaned_response,
-                        context_docs=context_docs
-                    )
-                    if enhanced_response:
-                        cleaned_response = enhanced_response
-                except Exception as e:
-                    logger.warning(f"Error enhancing Excel response: {str(e)}")
-
             # Return the final response
             return RAGResponse(
                 response=cleaned_response,
@@ -151,7 +186,7 @@ class RAGAgent(ExcelAwareRAGAgent):
             )
 
         except Exception as e:
-            logger.error(f"Error in …
+            logger.error(f"Error in RAGAgent: {str(e)}")
             raise
 
     def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
@@ -201,8 +236,7 @@ class RAGAgent(ExcelAwareRAGAgent):
     async def retrieve_context(
         self,
         query: str,
-        conversation_history: Optional[List[Dict]] = None
-        top_k: int = 3
+        conversation_history: Optional[List[Dict]] = None
     ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
         """
         Retrieve context with conversation history enhancement
@@ -229,7 +263,7 @@ class RAGAgent(ExcelAwareRAGAgent):
             # Retrieve similar documents
             results = self.vector_store.similarity_search(
                 query_embedding,
-                top_k=…
+                top_k=settings.TOP_CHUNKS
             )
 
             # Debug log search results
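A self-contained sketch of the new section-extraction behavior on toy input; it mirrors _extract_markdown_section from the hunk above but leaves out the _clean_markdown_content de-duplication step, and the sample document is invented.

    from typing import List

    def extract_markdown_section(docs: List[str], section_header: str) -> str:
        """Standalone copy of the section-lookup logic added to RAGAgent."""
        combined_text = '\n'.join(docs)
        section_start = combined_text.find(section_header)
        if section_start == -1:
            return ""
        next_section = combined_text.find("\n\n**", section_start + len(section_header))
        return combined_text[section_start:] if next_section == -1 else combined_text[section_start:next_section]

    docs = [
        "**AGENDA**\n- Budget review\n\n**DISCUSSIONS AND ACTION ITEMS**\n| No | Item |\n| :---- | :---- |\n| 1 | Ship v2 |",
    ]
    print(extract_markdown_section(docs, "**DISCUSSIONS AND ACTION ITEMS**"))
    # Prints the header plus its table, stopping before the next "**" section (none here).
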
src/db/__pycache__/mongodb_store.cpython-312.pyc
CHANGED
Binary files a/src/db/__pycache__/mongodb_store.cpython-312.pyc and b/src/db/__pycache__/mongodb_store.cpython-312.pyc differ
src/db/mongodb_store.py
CHANGED
@@ -4,6 +4,7 @@ from datetime import datetime
 from typing import List, Dict, Optional, Any
 from bson import ObjectId
 
+
 class MongoDBStore:
     def __init__(self, mongo_uri: str = "mongodb://localhost:27017"):
         """Initialize MongoDB connection"""
@@ -20,7 +21,8 @@ class MongoDBStore:
         filename: str,
         content_type: str,
         file_size: int,
-        url_path: str
+        url_path: str,
+        source: str
     ) -> str:
         """Store document metadata in MongoDB"""
         document = {
@@ -29,9 +31,10 @@ class MongoDBStore:
             "content_type": content_type,
             "file_size": file_size,
             "url_path": url_path,
+            "source": source,
             "upload_timestamp": datetime.now()
         }
 
         await self.documents.insert_one(document)
         return document_id
@@ -53,7 +56,8 @@ class MongoDBStore:
                 "content_type": 1,
                 "file_size": 1,
                 "url_path": 1,
-                "upload_timestamp": 1
+                "upload_timestamp": 1,
+                "source": 1
             }
         )
         return await cursor.to_list(length=None)
@@ -200,30 +204,31 @@ class MongoDBStore:
             "rating": None
         }
         result = await self.chat_history.insert_one(assistant_message)
 
         # Update conversation metadata
         await self.conversations.update_one(
             {"conversation_id": conversation_id},
             {
                 "$set": {"last_updated": datetime.now()},
-                …
+                # Increment by 2 since we store both messages
+                "$inc": {"message_count": 2}
             },
             upsert=True
         )
 
         return str(result.inserted_id)
 
     async def get_conversation_history(self, conversation_id: str) -> List[Dict]:
         """Retrieve complete conversation history"""
         cursor = self.chat_history.find(
             {"conversation_id": conversation_id}
         ).sort("timestamp", 1)
 
         history = []
         async for document in cursor:
             document["_id"] = str(document["_id"])
             history.append(document)
 
         return history
@@ -234,14 +239,15 @@ class MongoDBStore:
         """Get most recent messages from conversation"""
         cursor = self.chat_history.find(
             {"conversation_id": conversation_id}
-        …
-        …
+        # Multiply limit by 2 to account for user-assistant pairs
+        ).sort("timestamp", -1).limit(limit * 2)
 
         messages = []
         async for doc in cursor:
             messages.append(self._format_message(doc))
 
         return list(reversed(messages))

(The remaining hunks in src/db/mongodb_store.py only normalize blank lines and trailing whitespace around the other methods.)
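A hedged usage sketch for the updated store_document signature; the document_id parameter name and all example values are assumptions, since the method's leading parameters fall outside the hunks shown above, and a local MongoDB instance is assumed to be running.

    import asyncio
    from src.db.mongodb_store import MongoDBStore  # class shown in the diff above

    async def main():
        store = MongoDBStore("mongodb://localhost:27017")
        doc_id = await store.store_document(
            document_id="drive-file-123",  # assumed parameter; not visible in the hunk
            filename="meeting_notes.docx",
            content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            file_size=24576,
            url_path="/docs/drive-file-123_meeting_notes.docx",
            source="google_drive",  # new field introduced by this commit
        )
        print(doc_id)

    asyncio.run(main())
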
src/implementations/__pycache__/document_service.cpython-312.pyc
CHANGED
Binary files a/src/implementations/__pycache__/document_service.cpython-312.pyc and b/src/implementations/__pycache__/document_service.cpython-312.pyc differ
src/implementations/document_service.py
CHANGED
@@ -13,9 +13,10 @@ from src.models import DocumentResponse, DocumentInfo, BatchUploadResponse
 from src.utils.logger import logger
 from src.db.mongodb_store import MongoDBStore
 
+
 class DocumentService:
     def __init__(
         self,
         doc_processor: DocumentProcessor,
         mongodb: MongoDBStore
     ):
@@ -78,22 +79,23 @@ class DocumentService:
 
                 if not self._is_supported_format(file.filename):
                     failed_files.append(self._create_failed_file_entry(
                         file.filename,
                         "Unsupported file format"
                     ))
                     continue
 
                 document_response = await self._process_single_file(
                     file,
                     vector_store,
                     background_tasks
                 )
                 processed_files.append(document_response)
 
             except Exception as e:
                 logger.error(
                     f"Error processing file {file.filename}: {str(e)}")
                 failed_files.append(self._create_failed_file_entry(
                     file.filename,
                     str(e)
                 ))
@@ -138,7 +140,8 @@ class DocumentService:
                 filename=file.filename,
                 content_type=file.content_type,
                 file_size=os.path.getsize(file_path),
-                url_path=url_path
+                url_path=url_path,
+                source="user_upload"
             )
 
             # Process for vector store in background
@@ -161,21 +164,23 @@ class DocumentService:
                     url_path=url_path
                 )
             )
 
         except Exception as e:
             # Clean up file if it was created
             if file_path.exists():
                 try:
                     file_path.unlink()
                 except Exception as cleanup_error:
                     logger.error(
                         f"Error cleaning up file {file_path}: {str(cleanup_error)}")
 
             # Clean up from MongoDB if document was created
             try:
                 await self.mongodb.delete_document(document_id)
             except Exception as db_cleanup_error:
                 logger.error(
                     f"Error cleaning up MongoDB document {document_id}: {str(db_cleanup_error)}")
 
             logger.error(f"Error processing file {file.filename}: {str(e)}")
             raise
@@ -189,11 +194,12 @@ class DocumentService:
         """Process document content for vector store"""
         try:
             # Generate chunk IDs using document_id
             chunk_ids = [
                 f"{document_id}-chunk-{i}" for i in range(len(chunks))]
 
             # Get embeddings
             embeddings = vector_store.embedding_function(chunks)
 
             # Prepare metadata for each chunk
             metadatas = [{
                 'document_id': document_id,
@@ -209,17 +215,19 @@ class DocumentService:
                 metadatas=metadatas,
                 ids=chunk_ids
             )
 
             logger.info(
                 f"Successfully processed document {filename} (ID: {document_id}) into {len(chunks)} chunks")
 
         except Exception as e:
             logger.error(
                 f"Error processing document {filename} (ID: {document_id}) for vector store: {str(e)}")
             raise
 
     def _is_supported_format(self, filename: str) -> bool:
         """Check if file format is supported"""
         return any(filename.lower().endswith(ext)
                    for ext in self.doc_processor.supported_formats)

(The remaining hunks in src/implementations/document_service.py only re-wrap long call arguments and logger lines and adjust blank lines.)
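To make the chunk bookkeeping concrete, here is a standalone sketch of how chunk IDs and per-chunk metadata are assembled before the vector-store call; the chunk text is invented and the 'filename' key is an assumption, since one metadata line is cut off in the hunk above.

    document_id = "example-doc-1"
    filename = "meeting_notes.docx"
    chunks = ["first chunk of text", "second chunk of text"]

    # Mirrors the ID scheme used when processing a document for the vector store.
    chunk_ids = [f"{document_id}-chunk-{i}" for i in range(len(chunks))]

    metadatas = [{
        'document_id': document_id,
        'filename': filename,  # assumed key
        'chunk_index': i,
        'total_chunks': len(chunks),
    } for i in range(len(chunks))]

    print(chunk_ids)     # ['example-doc-1-chunk-0', 'example-doc-1-chunk-1']
    print(metadatas[1])
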
src/main.py
CHANGED
@@ -123,7 +123,8 @@ async def get_all_documents():
             "content_type": doc.get("content_type"),
             "file_size": doc.get("file_size"),
             "url_path": doc.get("url_path"),
-            "upload_timestamp": doc.get("upload_timestamp")
+            "upload_timestamp": doc.get("upload_timestamp"),
+            "source": doc.get("source")
         }
         formatted_documents.append(formatted_doc)
     except Exception as e:
@@ -334,7 +335,8 @@ async def process_drive_documents():
         google_service_account_path=settings.GOOGLE_SERVICE_ACCOUNT_PATH,
         folder_id=settings.GOOGLE_DRIVE_FOLDER_ID,
         temp_dir=settings.TEMP_DOWNLOAD_DIR,
-        doc_processor=doc_processor
+        doc_processor=doc_processor,
+        mongodb=mongodb  # Add MongoDB instance
     )
 
     # Process documents
src/utils/__pycache__/document_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/document_processor.cpython-312.pyc and b/src/utils/__pycache__/document_processor.cpython-312.pyc differ
src/utils/__pycache__/drive_document_processor.cpython-312.pyc
CHANGED
Binary files a/src/utils/__pycache__/drive_document_processor.cpython-312.pyc and b/src/utils/__pycache__/drive_document_processor.cpython-312.pyc differ
src/utils/document_processor.py
CHANGED
@@ -116,124 +116,6 @@ class DocumentProcessor:
             strip_whitespace=False  # Keep whitespace to maintain markdown formatting
         )
 
-    def split_text(self, text: str) -> List[str]:
-        """Split text with enforced overlap while preserving structure"""
-        try:
-            # Get initial split using RecursiveCharacterTextSplitter
-            initial_chunks = self.text_splitter.split_text(text)
-            if len(initial_chunks) <= 1:
-                return initial_chunks
-
-            # Process chunks with enforced overlap
-            final_chunks = []
-
-            for i, current_chunk in enumerate(initial_chunks):
-                if i == 0:
-                    final_chunks.append(current_chunk)
-                    continue
-
-                prev_chunk = final_chunks[-1]
-
-                # Get the last part of previous chunk for overlap
-                overlap_size = min(self.chunk_overlap, len(prev_chunk))
-                overlap_text = prev_chunk[-overlap_size:]
-
-                # For tables, include the header row
-                if '|' in current_chunk and '\n' in current_chunk:
-                    table_lines = current_chunk.split('\n')
-                    header_lines = []
-                    for line in table_lines:
-                        if line.strip().startswith('|'):
-                            header_lines.append(line)
-                        else:
-                            break
-                    if header_lines:
-                        header_text = '\n'.join(header_lines) + '\n'
-                        overlap_text = header_text + overlap_text
-
-                # Create new chunk with overlap
-                new_chunk = overlap_text + current_chunk
-
-                # Ensure we don't have duplicate content at the overlap point
-                if current_chunk.startswith(overlap_text):
-                    new_chunk = current_chunk
-
-                # Add context from previous chunk when needed
-                if not any(marker in new_chunk for marker in ['**AGENDA**', '**DISCUSSIONS**', '| No |']):
-                    context_markers = ['**AGENDA**',
-                                       '**DISCUSSIONS**', '| No |']
-                    for marker in context_markers:
-                        if marker in prev_chunk and marker not in new_chunk:
-                            new_chunk = marker + "\n" + new_chunk
-                            break
-
-                final_chunks.append(new_chunk)
-
-            # Validate and log overlaps
-            for i in range(len(final_chunks)-1):
-                actual_overlap = self._find_actual_overlap(
-                    final_chunks[i], final_chunks[i+1])
-                logging.debug(
-                    f"Overlap between chunks {i} and {i+1}: {len(actual_overlap)} characters")
-                if len(actual_overlap) < self.chunk_overlap:
-                    logging.warning(
-                        f"Insufficient overlap between chunks {i} and {i+1}")
-
-            return final_chunks
-
-            for start, end in table_sections:
-                # Process text before table if exists
-                if start > current_position:
-                    non_table_text = text[current_position:start]
-                    if non_table_text.strip():
-                        text_chunks = self.text_splitter.split_text(
-                            non_table_text)
-                        if chunks and text_chunks:
-                            # Ensure overlap with previous chunk
-                            prev_chunk = chunks[-1]
-                            overlap = self._get_overlap_text(prev_chunk)
-                            text_chunks[0] = overlap + text_chunks[0]
-                        chunks.extend(text_chunks)
-
-                # Process table as a single chunk with overlap
-                table_text = text[start:end]
-                if chunks:
-                    prev_chunk = chunks[-1]
-                    overlap = self._get_overlap_text(prev_chunk)
-                    table_text = overlap + table_text
-                chunks.append(table_text)
-                current_position = end
-
-            # Process remaining text after last table
-            if current_position < len(text):
-                remaining_text = text[current_position:]
-                if remaining_text.strip():
-                    text_chunks = self.text_splitter.split_text(remaining_text)
-                    if chunks and text_chunks:
-                        # Ensure overlap with previous chunk
-                        prev_chunk = chunks[-1]
-                        overlap = self._get_overlap_text(prev_chunk)
-                        text_chunks[0] = overlap + text_chunks[0]
-                    chunks.extend(text_chunks)
-
-            # Validate and adjust overlaps
-            chunks = self._ensure_minimum_overlap(chunks)
-
-            # Log chunk details for debugging
-            for i in range(len(chunks)-1):
-                overlap = self._find_actual_overlap(chunks[i], chunks[i+1])
-                logging.debug(
-                    f"Overlap between chunks {i} and {i+1}: {len(overlap)} characters")
-                logging.debug(f"End of chunk {i}: {chunks[i][-50:]}")
-                logging.debug(f"Start of chunk {i+1}: {chunks[i+1][:50]}")
-
-            return chunks
-
-        except Exception as e:
-            logging.error(f"Error in split_text: {str(e)}")
-            # Fallback to original text splitter
-            return self.text_splitter.split_text(text)
-
     def _find_break_point(self, text: str, prev_chunk: str) -> int:
         """
         Find suitable breaking point that maintains document structure
@@ -630,38 +512,42 @@ class DocumentProcessor:
         """Calculate SHA-256 hash of text"""
         return hashlib.sha256(text.encode()).hexdigest()
 
+    def _process_chunks(self, text: str) -> List[str]:
+        """Process text into chunks with proper overlap"""
+        chunks = self.text_splitter.split_text(text)
+
+        # Ensure minimum chunk size and handle overlaps
+        processed_chunks = []
+        for i, chunk in enumerate(chunks):
+            if i > 0:
+                # Add overlap from previous chunk
+                overlap_start = max(
+                    0, len(processed_chunks[-1]) - self.chunk_overlap)
+                chunk = processed_chunks[-1][overlap_start:] + chunk
+
+            if len(chunk) > self.chunk_size:
+                # Split oversized chunks
+                sub_chunks = self.text_splitter.split_text(chunk)
+                processed_chunks.extend(sub_chunks)
+            else:
+                processed_chunks.append(chunk)
+
+        return processed_chunks
+
+    async def process_document(self, file_path: Union[str, Path]) -> Dict:
+        """Process document with chunk overlapping"""
-        …
-        """Process …
         file_path = Path(file_path)
 
         if not self._validate_file(file_path):
             raise ValueError(f"Invalid file: {file_path}")
 
         content = self._extract_content(file_path)
-
-        # Try enhanced splitting with validation
-        chunks = self.split_text(content)
-        if not self._validate_chunks(content, chunks):
-            logging.warning(
-                "Enhanced splitting failed validation, falling back to original splitter")
-            chunks = self.text_splitter.split_text(content)
-
-        # Add logging to verify chunk overlap
-        for i in range(len(chunks)-1):
-            logging.debug(f"Chunk {i} ends with: {chunks[i][-50:]}")
-            logging.debug(f"Chunk {i+1} starts with: {chunks[i+1][:50]}")
-            logging.debug(
-                f"Overlap size: {self._calculate_overlap_size(chunks[i], chunks[i+1])} characters")
-
-        chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
+        chunks = self._process_chunks(content)
 
         return {
             'content': content,
             'chunks': chunks,
-            '…
-            'metadata': doc_metadata,
-            'statistics': self._generate_statistics(content, chunks)
+            'metadata': self._generate_metadata(file_path, content)
         }
 
     def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
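A standalone sketch of the overlap step introduced by _process_chunks: each chunk is prefixed with the tail of the previous processed chunk. It reproduces only the prefix-overlap logic on toy strings and omits the re-splitting of oversized chunks.

    from typing import List

    def overlap_chunks(chunks: List[str], chunk_overlap: int) -> List[str]:
        """Prefix each chunk with the last chunk_overlap characters of its predecessor."""
        processed: List[str] = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                overlap_start = max(0, len(processed[-1]) - chunk_overlap)
                chunk = processed[-1][overlap_start:] + chunk
            processed.append(chunk)
        return processed

    print(overlap_chunks(["abcdef", "ghijkl", "mnopqr"], chunk_overlap=3))
    # ['abcdef', 'defghijkl', 'jklmnopqr']
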
src/utils/drive_document_processor.py
CHANGED
@@ -8,6 +8,8 @@ from src.utils.google_drive_service import GoogleDriveService
|
|
8 |
from src.utils.document_processor import DocumentProcessor
|
9 |
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
|
10 |
from src.utils.logger import logger
|
|
|
|
|
11 |
|
12 |
class DriveDocumentProcessor:
|
13 |
def __init__(
|
@@ -15,38 +17,41 @@ class DriveDocumentProcessor:
|
|
15 |
google_service_account_path: str,
|
16 |
folder_id: str,
|
17 |
temp_dir: str,
|
18 |
-
doc_processor: DocumentProcessor
|
|
|
19 |
):
|
20 |
"""
|
21 |
Initialize Drive Document Processor
|
22 |
-
|
23 |
Args:
|
24 |
google_service_account_path (str): Path to Google service account credentials
|
25 |
folder_id (str): Google Drive folder ID to process
|
26 |
temp_dir (str): Directory for temporary files
|
27 |
doc_processor (DocumentProcessor): Instance of DocumentProcessor
|
28 |
"""
|
29 |
-
self.google_drive_service = GoogleDriveService(
|
|
|
30 |
self.folder_id = folder_id
|
31 |
self.temp_dir = Path(temp_dir)
|
32 |
self.doc_processor = doc_processor
|
33 |
-
|
|
|
34 |
# Create temp directory if it doesn't exist
|
35 |
self.temp_dir.mkdir(exist_ok=True)
|
36 |
-
|
37 |
# Define supported MIME types
|
38 |
self.supported_mime_types = {
|
39 |
# Google Docs
|
40 |
'application/vnd.google-apps.document': '.docx',
|
41 |
-
|
42 |
# Microsoft Word Documents
|
43 |
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
44 |
'application/msword': '.doc',
|
45 |
-
|
46 |
# Microsoft Excel Documents
|
47 |
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
|
48 |
'application/vnd.ms-excel': '.xls',
|
49 |
-
|
50 |
# Text Documents
|
51 |
'text/plain': '.txt',
|
52 |
'text/csv': '.csv',
|
@@ -55,7 +60,7 @@ class DriveDocumentProcessor:
|
|
55 |
'text/xml': '.xml',
|
56 |
'application/json': '.json',
|
57 |
'application/rtf': '.rtf',
|
58 |
-
|
59 |
# PDF Documents
|
60 |
'application/pdf': '.pdf'
|
61 |
}
|
@@ -64,18 +69,78 @@ class DriveDocumentProcessor:
|
|
64 |
'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
65 |
}
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
async def process_documents(
|
68 |
self,
|
69 |
vector_store: ChromaVectorStore,
|
70 |
-
|
|
|
71 |
) -> Dict[str, Any]:
|
72 |
"""
|
73 |
Process all documents in the specified Drive folder
|
74 |
-
|
75 |
Args:
|
76 |
vector_store (ChromaVectorStore): Vector store instance
|
77 |
include_subfolders (bool): Whether to process documents in subfolders
|
78 |
-
|
79 |
Returns:
|
80 |
Dict[str, Any]: Processing results
|
81 |
"""
|
@@ -85,32 +150,35 @@ class DriveDocumentProcessor:
|
|
85 |
self.folder_id,
|
86 |
include_subfolders=include_subfolders
|
87 |
)
|
88 |
-
|
|
|
|
|
|
|
89 |
processed_files = []
|
90 |
skipped_files = []
|
91 |
errors = []
|
92 |
-
|
93 |
for file in files:
|
94 |
# Skip if it's a folder
|
95 |
if file.get('mimeType') == 'application/vnd.google-apps.folder':
|
96 |
continue
|
97 |
-
|
98 |
# Get file path (including folder structure if available)
|
99 |
file_path = self._get_file_path(file)
|
100 |
file['display_path'] = file_path
|
101 |
-
|
102 |
result = await self._process_single_file(file, vector_store)
|
103 |
-
|
104 |
if result['status'] == 'processed':
|
105 |
processed_files.append(result['data'])
|
106 |
elif result['status'] == 'skipped':
|
107 |
skipped_files.append(result['data'])
|
108 |
else: # status == 'error'
|
109 |
errors.append(result['data'])
|
110 |
-
|
111 |
# Clean up temporary directory if empty
|
112 |
self._cleanup_temp_dir()
|
113 |
-
|
114 |
return {
|
115 |
"status": "completed",
|
116 |
"processed_files": {
|
@@ -126,7 +194,7 @@ class DriveDocumentProcessor:
|
|
126 |
"details": errors
|
127 |
}
|
128 |
}
|
129 |
-
|
130 |
except Exception as e:
|
131 |
logger.error(f"Error processing Drive documents: {str(e)}")
|
132 |
raise HTTPException(
|
@@ -137,20 +205,20 @@ class DriveDocumentProcessor:
|
|
137 |
def _get_file_path(self, file: Dict[str, Any]) -> str:
|
138 |
"""
|
139 |
Get the full path for a file including its folder structure
|
140 |
-
|
141 |
Args:
|
142 |
file (Dict[str, Any]): File metadata
|
143 |
-
|
144 |
Returns:
|
145 |
str: Display path of the file
|
146 |
"""
|
147 |
path_parts = [file['name']]
|
148 |
-
|
149 |
# Add folder path if available (new structure)
|
150 |
if folder_path := file.get('folder_path', []):
|
151 |
for folder in reversed(folder_path):
|
152 |
path_parts.insert(0, folder['name'])
|
153 |
-
|
154 |
return '/'.join(path_parts)
|
155 |
|
156 |
async def _process_single_file(
|
@@ -160,7 +228,7 @@ class DriveDocumentProcessor:
|
|
160 |
) -> Dict[str, Any]:
|
161 |
"""Process a single Drive file"""
|
162 |
mime_type = file.get('mimeType', '')
|
163 |
-
|
164 |
# Skip if mime type not supported
|
165 |
if mime_type not in self.supported_mime_types:
|
166 |
return {
|
@@ -171,11 +239,11 @@ class DriveDocumentProcessor:
|
|
171 |
'reason': f'Unsupported mime type: {mime_type}'
|
172 |
}
|
173 |
}
|
174 |
-
|
175 |
try:
|
176 |
document_id = file['id']
|
177 |
modified_time = file.get('modifiedTime', 'N/A')
|
178 |
-
|
179 |
# Check if document should be processed
|
180 |
if self.save_document(document_id, vector_store, modified_time):
|
181 |
# Download and process file
|
@@ -183,13 +251,13 @@ class DriveDocumentProcessor:
|
|
183 |
file['id'],
|
184 |
mime_type
|
185 |
)
|
186 |
-
|
187 |
try:
|
188 |
# Process document
|
189 |
processed_doc = await self.doc_processor.process_document(
|
190 |
str(temp_file_path)
|
191 |
)
|
192 |
-
|
193 |
# Add to vector store with path information
|
194 |
self._add_to_vector_store(
|
195 |
processed_doc['chunks'],
|
@@ -197,7 +265,17 @@ class DriveDocumentProcessor:
|
|
197 |
mime_type,
|
198 |
vector_store
|
199 |
)
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
return {
|
202 |
'status': 'processed',
|
203 |
'data': {
|
@@ -207,7 +285,7 @@ class DriveDocumentProcessor:
|
|
207 |
'chunks_processed': len(processed_doc['chunks'])
|
208 |
}
|
209 |
}
|
210 |
-
|
211 |
finally:
|
212 |
# Clean up temporary file
|
213 |
if temp_file_path.exists():
|
@@ -221,7 +299,7 @@ class DriveDocumentProcessor:
|
|
221 |
'reason': 'Document already exists in the memory.'
|
222 |
}
|
223 |
}
|
224 |
-
|
225 |
except Exception as e:
|
226 |
logger.error(f"Error processing file {file['name']}: {str(e)}")
|
227 |
return {
|
@@ -243,7 +321,7 @@ class DriveDocumentProcessor:
|
|
243 |
"""Add processed chunks to vector store with path information"""
|
244 |
chunk_metadatas = []
|
245 |
chunk_ids = []
|
246 |
-
|
247 |
modified_time = file.get('modifiedTime', 'N/A')
|
248 |
file_path = file.get('display_path', file['name'])
|
249 |
|
@@ -260,7 +338,7 @@ class DriveDocumentProcessor:
|
|
260 |
"file_type": self.supported_mime_types[mime_type],
|
261 |
"is_google_doc": mime_type.startswith('application/vnd.google-apps')
|
262 |
})
|
263 |
-
|
264 |
vector_store.add_documents(
|
265 |
documents=chunks,
|
266 |
metadatas=chunk_metadatas,
|
@@ -275,7 +353,7 @@ class DriveDocumentProcessor:
|
|
275 |
"""Download and save file to temporary location"""
|
276 |
extension = self.supported_mime_types[mime_type]
|
277 |
temp_file_path = self.temp_dir / f"{file_id}{extension}"
|
278 |
-
|
279 |
if mime_type in self.google_docs_export_types:
|
280 |
# Download Google Doc in the specified export format
|
281 |
content = self.google_drive_service.export_file(
|
@@ -285,13 +363,13 @@ class DriveDocumentProcessor:
|
|
285 |
else:
|
286 |
# Download regular file
|
287 |
content = self.google_drive_service.download_file(file_id)
|
288 |
-
|
289 |
with open(temp_file_path, 'wb') as f:
|
290 |
if isinstance(content, str):
|
291 |
f.write(content.encode('utf-8'))
|
292 |
else:
|
293 |
f.write(content)
|
294 |
-
|
295 |
return temp_file_path
|
296 |
|
297 |
def save_document(
|
@@ -302,35 +380,36 @@ class DriveDocumentProcessor:
|
|
302 |
) -> bool:
|
303 |
"""
|
304 |
Check if document needs to be processed based on modification date
|
305 |
-
|
306 |
Args:
|
307 |
document_id (str): ID of the document to check
|
308 |
vector_store (ChromaVectorStore): Vector store instance
|
309 |
modified_date (str): Modified date to compare against
|
310 |
-
|
311 |
Returns:
|
312 |
bool: True if document should be processed, False otherwise
|
313 |
"""
|
314 |
try:
|
315 |
# Retrieve all chunks for the given document_id
|
316 |
chunks = vector_store.get_document_chunks(document_id)
|
317 |
-
|
318 |
if not chunks:
|
319 |
# Document doesn't exist in vector store
|
320 |
return True
|
321 |
-
|
322 |
# Check the modified_time of the first chunk
|
323 |
first_chunk_metadata = chunks[0].get("metadata", {})
|
324 |
-
|
325 |
if first_chunk_metadata.get("modified_time") != modified_date:
|
326 |
# If modified_time doesn't match, delete existing chunks
|
327 |
vector_store.delete_document(document_id)
|
328 |
-
logger.info(
|
|
|
329 |
return True
|
330 |
-
|
331 |
logger.info(f"Document {document_id} is up to date, skipping")
|
332 |
return False
|
333 |
-
|
334 |
except Exception as e:
|
335 |
logger.error(f"Error checking document status: {str(e)}")
|
336 |
# In case of error, process the document to be safe
|
@@ -343,4 +422,4 @@ class DriveDocumentProcessor:
|
|
343 |
self.temp_dir.rmdir()
|
344 |
except Exception as e:
|
345 |
logger.error(f"Error cleaning up temp directory: {str(e)}")
|
346 |
-
# Don't raise the error as this is a cleanup operation
|
|
|
(updated src/utils/drive_document_processor.py; lines marked + were added)

  8   from src.utils.document_processor import DocumentProcessor
  9   from src.vectorstores.chroma_vectorstore import ChromaVectorStore
 10   from src.utils.logger import logger
 11 + from src.db.mongodb_store import MongoDBStore
 12 +
 13
 14   class DriveDocumentProcessor:
 15       def __init__(
 17           google_service_account_path: str,
 18           folder_id: str,
 19           temp_dir: str,
 20 +         doc_processor: DocumentProcessor,
 21 +         mongodb: MongoDBStore  # Add MongoDB
 22       ):
 23           """
 24           Initialize Drive Document Processor
 25 +
 26           Args:
 27               google_service_account_path (str): Path to Google service account credentials
 28               folder_id (str): Google Drive folder ID to process
 29               temp_dir (str): Directory for temporary files
 30               doc_processor (DocumentProcessor): Instance of DocumentProcessor
 31           """
 32 +         self.google_drive_service = GoogleDriveService(
 33 +             google_service_account_path)
 34           self.folder_id = folder_id
 35           self.temp_dir = Path(temp_dir)
 36           self.doc_processor = doc_processor
 37 +         self.mongodb = mongodb  # Store MongoDB instance
 38 +
 39           # Create temp directory if it doesn't exist
 40           self.temp_dir.mkdir(exist_ok=True)
 41 +
 42           # Define supported MIME types
 43           self.supported_mime_types = {
 44               # Google Docs
 45               'application/vnd.google-apps.document': '.docx',
 46 +
 47               # Microsoft Word Documents
 48               'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
 49               'application/msword': '.doc',
 50 +
 51               # Microsoft Excel Documents
 52               'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
 53               'application/vnd.ms-excel': '.xls',
 54 +
 55               # Text Documents
 56               'text/plain': '.txt',
 57               'text/csv': '.csv',
 60               'text/xml': '.xml',
 61               'application/json': '.json',
 62               'application/rtf': '.rtf',
 63 +
 64               # PDF Documents
 65               'application/pdf': '.pdf'
 66           }
 69               'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
 70           }
 71
 72 +     async def _cleanup_orphaned_documents(
 73 +         self,
 74 +         drive_files: List[Dict[str, Any]],
 75 +         vector_store: ChromaVectorStore
 76 +     ) -> Dict[str, Any]:
 77 +         """
 78 +         Clean up documents that exist in MongoDB but not in Google Drive
 79 +
 80 +         Args:
 81 +             drive_files (List[Dict[str, Any]]): List of files from Google Drive
 82 +             vector_store (ChromaVectorStore): Vector store instance
 83 +
 84 +         Returns:
 85 +             Dict[str, Any]: Cleanup statistics
 86 +         """
 87 +         try:
 88 +             # Get all documents from MongoDB
 89 +             mongo_docs = await self.mongodb.get_all_documents()
 90 +
 91 +             # Create set of Google Drive file IDs
 92 +             drive_file_ids = {file['id'] for file in drive_files}
 93 +
 94 +             deleted_count = 0
 95 +             failed_deletions = []
 96 +
 97 +             # Check each MongoDB document
 98 +             for doc in mongo_docs:
 99 +                 # Only process Google Drive documents
100 +                 if doc.get('source') != 'google_drive':
101 +                     continue
102 +
103 +                 doc_id = doc.get('document_id')
104 +                 if not doc_id or doc_id not in drive_file_ids:
105 +                     try:
106 +                         # Delete from MongoDB
107 +                         await self.mongodb.delete_document(doc_id)
108 +
109 +                         # Delete from vector store
110 +                         vector_store.delete_document(doc_id)
111 +
112 +                         deleted_count += 1
113 +
114 +                     except Exception as e:
115 +                         logger.error(
116 +                             f"Error deleting orphaned document {doc_id}: {str(e)}")
117 +                         failed_deletions.append({
118 +                             'document_id': doc_id,
119 +                             'error': str(e)
120 +                         })
121 +
122 +             return {
123 +                 'orphaned_documents_deleted': deleted_count,
124 +                 'failed_deletions': failed_deletions
125 +             }
126 +
127 +         except Exception as e:
128 +             logger.error(f"Error in cleanup_orphaned_documents: {str(e)}")
129 +             raise
130 +
131       async def process_documents(
132           self,
133           vector_store: ChromaVectorStore,
134 +         # New parameter with default True for backward compatibility
135 +         include_subfolders: bool = True
136       ) -> Dict[str, Any]:
137           """
138           Process all documents in the specified Drive folder
139 +
140           Args:
141               vector_store (ChromaVectorStore): Vector store instance
142               include_subfolders (bool): Whether to process documents in subfolders
143 +
144           Returns:
145               Dict[str, Any]: Processing results
146           """
150               self.folder_id,
151               include_subfolders=include_subfolders
152           )
153 +
154 +         # Clean up orphaned documents first
155 +         cleanup_results = await self._cleanup_orphaned_documents(files, vector_store)
156 +
157           processed_files = []
158           skipped_files = []
159           errors = []
160 +
161           for file in files:
162               # Skip if it's a folder
163               if file.get('mimeType') == 'application/vnd.google-apps.folder':
164                   continue
165 +
166               # Get file path (including folder structure if available)
167               file_path = self._get_file_path(file)
168               file['display_path'] = file_path
169 +
170               result = await self._process_single_file(file, vector_store)
171 +
172               if result['status'] == 'processed':
173                   processed_files.append(result['data'])
174               elif result['status'] == 'skipped':
175                   skipped_files.append(result['data'])
176               else:  # status == 'error'
177                   errors.append(result['data'])
178 +
179           # Clean up temporary directory if empty
180           self._cleanup_temp_dir()
181 +
182           return {
183               "status": "completed",
184               "processed_files": {
194                   "details": errors
195               }
196           }
197 +
198       except Exception as e:
199           logger.error(f"Error processing Drive documents: {str(e)}")
200           raise HTTPException(
205       def _get_file_path(self, file: Dict[str, Any]) -> str:
206           """
207           Get the full path for a file including its folder structure
208 +
209           Args:
210               file (Dict[str, Any]): File metadata
211 +
212           Returns:
213               str: Display path of the file
214           """
215           path_parts = [file['name']]
216 +
217           # Add folder path if available (new structure)
218           if folder_path := file.get('folder_path', []):
219               for folder in reversed(folder_path):
220                   path_parts.insert(0, folder['name'])
221 +
222           return '/'.join(path_parts)
223
224       async def _process_single_file(
228       ) -> Dict[str, Any]:
229           """Process a single Drive file"""
230           mime_type = file.get('mimeType', '')
231 +
232           # Skip if mime type not supported
233           if mime_type not in self.supported_mime_types:
234               return {
239                       'reason': f'Unsupported mime type: {mime_type}'
240                   }
241               }
242 +
243           try:
244               document_id = file['id']
245               modified_time = file.get('modifiedTime', 'N/A')
246 +
247               # Check if document should be processed
248               if self.save_document(document_id, vector_store, modified_time):
249                   # Download and process file
251                       file['id'],
252                       mime_type
253                   )
254 +
255                   try:
256                       # Process document
257                       processed_doc = await self.doc_processor.process_document(
258                           str(temp_file_path)
259                       )
260 +
261                       # Add to vector store with path information
262                       self._add_to_vector_store(
263                           processed_doc['chunks'],
265                           mime_type,
266                           vector_store
267                       )
268 +
269 +                     # Add MongoDB storage - Store Google Drive URL
270 +                     await self.mongodb.store_document(
271 +                         document_id=document_id,
272 +                         filename=file['name'],
273 +                         content_type=mime_type,
274 +                         file_size=0,  # Not needed for drive documents
275 +                         url_path=f"https://drive.google.com/file/d/{document_id}/view",
276 +                         source="google_drive"
277 +                     )
278 +
279                       return {
280                           'status': 'processed',
281                           'data': {
285                               'chunks_processed': len(processed_doc['chunks'])
286                           }
287                       }
288 +
289                   finally:
290                       # Clean up temporary file
291                       if temp_file_path.exists():
299                       'reason': 'Document already exists in the memory.'
300                   }
301               }
302 +
303           except Exception as e:
304               logger.error(f"Error processing file {file['name']}: {str(e)}")
305               return {
321           """Add processed chunks to vector store with path information"""
322           chunk_metadatas = []
323           chunk_ids = []
324 +
325           modified_time = file.get('modifiedTime', 'N/A')
326           file_path = file.get('display_path', file['name'])
327
338                   "file_type": self.supported_mime_types[mime_type],
339                   "is_google_doc": mime_type.startswith('application/vnd.google-apps')
340               })
341 +
342           vector_store.add_documents(
343               documents=chunks,
344               metadatas=chunk_metadatas,
353           """Download and save file to temporary location"""
354           extension = self.supported_mime_types[mime_type]
355           temp_file_path = self.temp_dir / f"{file_id}{extension}"
356 +
357           if mime_type in self.google_docs_export_types:
358               # Download Google Doc in the specified export format
359               content = self.google_drive_service.export_file(
363           else:
364               # Download regular file
365               content = self.google_drive_service.download_file(file_id)
366 +
367           with open(temp_file_path, 'wb') as f:
368               if isinstance(content, str):
369                   f.write(content.encode('utf-8'))
370               else:
371                   f.write(content)
372 +
373           return temp_file_path
374
375       def save_document(
380       ) -> bool:
381           """
382           Check if document needs to be processed based on modification date
383 +
384           Args:
385               document_id (str): ID of the document to check
386               vector_store (ChromaVectorStore): Vector store instance
387               modified_date (str): Modified date to compare against
388 +
389           Returns:
390               bool: True if document should be processed, False otherwise
391           """
392           try:
393               # Retrieve all chunks for the given document_id
394               chunks = vector_store.get_document_chunks(document_id)
395 +
396           if not chunks:
397               # Document doesn't exist in vector store
398               return True
399 +
400           # Check the modified_time of the first chunk
401           first_chunk_metadata = chunks[0].get("metadata", {})
402 +
403           if first_chunk_metadata.get("modified_time") != modified_date:
404               # If modified_time doesn't match, delete existing chunks
405               vector_store.delete_document(document_id)
406 +             logger.info(
407 +                 f"Document {document_id} has been modified, will reprocess")
408               return True
409 +
410           logger.info(f"Document {document_id} is up to date, skipping")
411           return False
412 +
413       except Exception as e:
414           logger.error(f"Error checking document status: {str(e)}")
415           # In case of error, process the document to be safe
422               self.temp_dir.rmdir()
423           except Exception as e:
424               logger.error(f"Error cleaning up temp directory: {str(e)}")
425 +             # Don't raise the error as this is a cleanup operation
src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc
CHANGED
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
|
src/vectorstores/chroma_vectorstore.py
CHANGED
@@ -97,70 +97,83 @@ class ChromaVectorStore(BaseVectorStore):
(previous version; lines marked - were removed or changed)

 97         top_k: int = 3,
 98         **kwargs
 99     ) -> List[Dict[str, Any]]:
100 -       """
101 -       Perform similarity search with improved matching
102 -       """
103         try:
104 -           #
105             results = self.collection.query(
106                 query_embeddings=[query_embedding],
107 -               n_results=
108                 include=['documents', 'metadatas', 'distances']
109             )
110
111 -           if not results or 'documents' not in results:
112 -               logging.warning("No results found in similarity search")
113                 return []
114
115             formatted_results = []
116 -           documents = results['documents'][0]
117 -           metadatas = results['metadatas'][0]
118 -
119 -           distances = results['distances'][0] if results.get('distances') else [
120 -               None] * len(documents)
121
122 -           #
123             for doc, meta, dist in zip(documents, metadatas, distances):
124 -
125 -
126 -
127
128 -               # More permissive threshold and include all results for filtering
129 -               if similarity_score is not None and similarity_score > 0.2:  # Lower threshold
130                     formatted_results.append({
131 -                       'text':
132 -                       'metadata':
133 -                       'score':
134                     })
135
136 -           # Sort by score and
137 -           formatted_results.sort(key=lambda x: x['score']
138 -
139 -           # Check if results are from same document and get consecutive chunks
140 -           if formatted_results:
141 -               first_doc_id = formatted_results[0]['metadata'].get(
142 -                   'document_id')
143 -               all_chunks_same_doc = []
144 -
145 -               # Get all chunks from the same document
146 -               for result in formatted_results:
147 -                   if result['metadata'].get('document_id') == first_doc_id:
148 -                       all_chunks_same_doc.append(result)
149 -
150 -               # Sort chunks by their index to maintain document flow
151 -               all_chunks_same_doc.sort(
152 -                   key=lambda x: x['metadata'].get('chunk_index', 0)
153 -               )
154 -
155 -               # Return either all chunks from same document or top_k results
156 -               if len(all_chunks_same_doc) > 0:
157 -                   return all_chunks_same_doc[:top_k]
158 -
159             return formatted_results[:top_k]
160
161         except Exception as e:
162 -           logging.error(
163 -               f"Error performing similarity search in ChromaDB: {str(e)}")
164             raise
165
166     def get_all_documents(

(updated version; lines marked + were added)

 97         top_k: int = 3,
 98         **kwargs
 99     ) -> List[Dict[str, Any]]:
100 +       """Perform similarity search with improved chunk handling"""
101         try:
102 +           # Get more initial results to account for sequential chunks
103             results = self.collection.query(
104                 query_embeddings=[query_embedding],
105 +               n_results=max(top_k * 2, 10),
106                 include=['documents', 'metadatas', 'distances']
107             )
108
109 +           if not results or 'documents' not in results:
110                 return []
111
112             formatted_results = []
113 +           documents = results['documents'][0]
114 +           metadatas = results['metadatas'][0]
115 +           distances = results['distances'][0]
116
117 +           # Group chunks by document_id
118 +           doc_chunks = {}
119             for doc, meta, dist in zip(documents, metadatas, distances):
120 +               doc_id = meta.get('document_id')
121 +               chunk_index = meta.get('chunk_index', 0)
122 +
123 +               if doc_id not in doc_chunks:
124 +                   doc_chunks[doc_id] = []
125 +
126 +               doc_chunks[doc_id].append({
127 +                   'text': doc,
128 +                   'metadata': meta,
129 +                   'score': 1.0 - dist,
130 +                   'chunk_index': chunk_index
131 +               })
132 +
133 +           # Process each document's chunks
134 +           for doc_id, chunks in doc_chunks.items():
135 +               # Sort chunks by index
136 +               chunks.sort(key=lambda x: x['chunk_index'])
137 +
138 +               # Find sequences of chunks with good scores
139 +               good_sequences = []
140 +               current_sequence = []
141 +
142 +               for chunk in chunks:
143 +                   if chunk['score'] > 0.3:  # Adjust threshold as needed
144 +                       if not current_sequence or \
145 +                          chunk['chunk_index'] == current_sequence[-1]['chunk_index'] + 1:
146 +                           current_sequence.append(chunk)
147 +                       else:
148 +                           if current_sequence:
149 +                               good_sequences.append(current_sequence)
150 +                           current_sequence = [chunk]
151 +                   else:
152 +                       if current_sequence:
153 +                           good_sequences.append(current_sequence)
154 +                       current_sequence = []
155 +
156 +               if current_sequence:
157 +                   good_sequences.append(current_sequence)
158 +
159 +               # Add best sequences to results
160 +               for sequence in good_sequences:
161 +                   avg_score = sum(c['score']
162 +                                   for c in sequence) / len(sequence)
163 +                   combined_text = ' '.join(c['text'] for c in sequence)
164
165                     formatted_results.append({
166 +                       'text': combined_text,
167 +                       'metadata': sequence[0]['metadata'],
168 +                       'score': avg_score
169                     })
170
171 +           # Sort by score and return top_k
172 +           formatted_results.sort(key=lambda x: x['score'], reverse=True)
173             return formatted_results[:top_k]
174
175         except Exception as e:
176 +           logging.error(f"Error in similarity search: {str(e)}")
177             raise
178
179     def get_all_documents(
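
The heart of the new similarity_search is the sequence-merging step: hits are grouped by document_id, ordered by chunk_index, and runs of consecutive chunks scoring above the 0.3 threshold are combined into a single result whose score is the run's average. Below is a standalone sketch of that merge step for a single document's hits, using the same field names; the sample data is made up for illustration.

# Standalone sketch of the consecutive-chunk merge step; not the class method itself.
from typing import Any, Dict, List


def merge_consecutive_chunks(
    chunks: List[Dict[str, Any]], threshold: float = 0.3
) -> List[Dict[str, Any]]:
    """Merge runs of consecutive, high-scoring chunks into single results."""
    chunks = sorted(chunks, key=lambda c: c['chunk_index'])
    sequences, current = [], []

    for chunk in chunks:
        if chunk['score'] > threshold:
            # Extend the run only if this chunk directly follows the previous one
            if not current or chunk['chunk_index'] == current[-1]['chunk_index'] + 1:
                current.append(chunk)
            else:
                sequences.append(current)
                current = [chunk]
        else:
            if current:
                sequences.append(current)
            current = []
    if current:
        sequences.append(current)

    return [{
        'text': ' '.join(c['text'] for c in seq),
        'metadata': seq[0]['metadata'],
        'score': sum(c['score'] for c in seq) / len(seq),
    } for seq in sequences]


# Chunks 4 and 5 are adjacent and above threshold, so they merge into one result;
# the low-scoring chunk 9 is dropped.
hits = [
    {'text': 'first part of a passage', 'metadata': {'document_id': 'doc1'}, 'score': 0.82, 'chunk_index': 4},
    {'text': 'second part of the same passage', 'metadata': {'document_id': 'doc1'}, 'score': 0.74, 'chunk_index': 5},
    {'text': 'unrelated text', 'metadata': {'document_id': 'doc1'}, 'score': 0.10, 'chunk_index': 9},
]
print(merge_consecutive_chunks(hits))

Merging adjacent chunks this way returns longer, more coherent passages than ranking isolated chunks in a fixed window.
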
|