Changes to be committed:
modified: chroma/5c23c332-257c-4409-8a58-767cdd7c3dea/length.bin
modified: chroma/chroma.sqlite3
modified: config/__pycache__/config.cpython-312.pyc
modified: src/__pycache__/main.cpython-312.pyc
modified: src/agents/__pycache__/rag_agent.cpython-312.pyc
modified: src/agents/__pycache__/system_instructions_rag.cpython-312.pyc
modified: src/agents/rag_agent.py
modified: src/agents/system_instructions_rag.py
modified: src/utils/__pycache__/drive_document_processor.cpython-312.pyc
modified: src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc
modified: src/utils/__pycache__/google_drive_service.cpython-312.pyc
modified: src/utils/drive_document_processor.py
modified: src/utils/enhanced_excel_processor.py
modified: src/utils/google_drive_service.py
modified: src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc
modified: src/vectorstores/chroma_vectorstore.py
new file: temp_downloads/17he27jN4louYr1xOYASf4BP2e-tGTICt.xlsx
new file: temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx
Enhanced the Support for Excel sheets
- config/__pycache__/config.cpython-312.pyc +0 -0
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/agents/__pycache__/rag_agent.cpython-312.pyc +0 -0
- src/agents/__pycache__/system_instructions_rag.cpython-312.pyc +0 -0
- src/agents/rag_agent.py +114 -40
- src/agents/system_instructions_rag.py +22 -5
- src/utils/__pycache__/drive_document_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc +0 -0
- src/utils/__pycache__/google_drive_service.cpython-312.pyc +0 -0
- src/utils/drive_document_processor.py +105 -74
- src/utils/enhanced_excel_processor.py +151 -77
- src/utils/google_drive_service.py +49 -11
- src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc +0 -0
- src/vectorstores/chroma_vectorstore.py +42 -22
- temp_downloads/17he27jN4louYr1xOYASf4BP2e-tGTICt.xlsx +0 -0
- temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx +0 -0
Binary files a/config/__pycache__/config.cpython-312.pyc and b/config/__pycache__/config.cpython-312.pyc differ
Binary files a/src/__pycache__/main.cpython-312.pyc and b/src/__pycache__/main.cpython-312.pyc differ
Binary files a/src/agents/__pycache__/rag_agent.cpython-312.pyc and b/src/agents/__pycache__/rag_agent.cpython-312.pyc differ
Binary files a/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc and b/src/agents/__pycache__/system_instructions_rag.cpython-312.pyc differ
src/agents/rag_agent.py

@@ -45,85 +45,156 @@ class RAGAgent(ExcelAwareRAGAgent):
     async def generate_response(
         self,
         query: str,
-        conversation_id: Optional[str]
-        temperature: float
+        conversation_id: Optional[str],
+        temperature: float,
         max_tokens: Optional[int] = None,
         context_docs: Optional[List[str]] = None
     ) -> RAGResponse:
+        """Generate response with specific handling for different query types"""
         try:
+            # First, check if this is an introduction/welcome message query
+            is_introduction = (
+                "wants support" in query and
+                "This is Introduction" in query and
+                ("A new user with name:" in query or "An old user with name:" in query)
+            )
+
+            if is_introduction:
+                # Handle introduction message - no context needed
+                welcome_message = self._handle_contact_query(query)
+                return RAGResponse(
+                    response=welcome_message,
+                    context_docs=[],
+                    sources=[],
+                    scores=None
+                )
+
-            # Get conversation history
-            history = await self.mongodb.get_recent_messages(
-                conversation_id,
-                limit=self.conversation_manager.max_messages
-            )
+            # Get conversation history if conversation_id exists
+            history = []
+            if conversation_id:
+                history = await self.mongodb.get_recent_messages(
+                    conversation_id,
+                    limit=self.conversation_manager.max_messages
+                )

+            # Get relevant history within token limits
+            history = self.conversation_manager.get_relevant_history(
+                messages=history,
+                current_query=query
+            )

             # Retrieve context if not provided
             if not context_docs:
                 context_docs, sources, scores = await self.retrieve_context(
-                    query,
+                    query=query,
+                    conversation_history=history
                 )
             else:
                 sources = None
                 scores = None

+            # Check if we have any relevant context
+            if not context_docs:
+                return RAGResponse(
+                    response="Information about this is not available, do you want to inquire about something else?",
+                    context_docs=[],
+                    sources=[],
+                    scores=None
+                )
+
+            # Check if this is an Excel-related query
+            has_excel_content = any('Sheet:' in doc for doc in context_docs)
             if has_excel_content:
                 try:
                     context_docs = self._process_excel_context(context_docs, query)
                 except Exception as e:
                     logger.warning(f"Error processing Excel context: {str(e)}")
-                    # Continue with original context if Excel processing fails

             # Generate prompt with context and history
             augmented_prompt = self.conversation_manager.generate_prompt_with_history(
                 current_query=query,
+                history=history,
                 context_docs=context_docs
             )

+            # Generate initial response
             response = self.llm.generate(
-                augmented_prompt,
+                prompt=augmented_prompt,
                 temperature=temperature,
                 max_tokens=max_tokens
             )

+            # Clean the response
+            cleaned_response = self._clean_response(response)
+
+            # For Excel queries, enhance the response
             if has_excel_content:
                 try:
+                    enhanced_response = await self.enhance_excel_response(
                         query=query,
+                        response=cleaned_response,
                         context_docs=context_docs
                     )
+                    if enhanced_response:
+                        cleaned_response = enhanced_response
                 except Exception as e:
                     logger.warning(f"Error enhancing Excel response: {str(e)}")
-                    # Continue with original response if enhancement fails

+            # Return the final response
             return RAGResponse(
+                response=cleaned_response,
                 context_docs=context_docs,
                 sources=sources,
                 scores=scores
             )

         except Exception as e:
+            logger.error(f"Error in SystemInstructionsRAGAgent: {str(e)}")
             raise
+
+    def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
+        """
+        Create prompt for generating response from context
+
+        Args:
+            query (str): User query
+            context_docs (List[str]): Retrieved context documents
+
+        Returns:
+            str: Formatted prompt for the LLM
+        """
+        if not context_docs:
+            return f"Query: {query}\nResponse: Information about this is not available, do you want to inquire about something else?"
+
+        # Format context documents
+        formatted_context = "\n\n".join(
+            f"Context {i+1}:\n{doc.strip()}"
+            for i, doc in enumerate(context_docs)
+            if doc and doc.strip()
+        )
+
+        # Build the prompt with detailed instructions
+        prompt = f"""You are a knowledgeable assistant. Use the following context to answer the query accurately and informatively.
+
+Context Information:
+{formatted_context}
+
+Query: {query}
+
+Instructions:
+1. Base your response ONLY on the information provided in the context above
+2. If the context contains numbers, statistics, or specific details, include them in your response
+3. Keep your response focused and relevant to the query
+4. Use clear and professional language
+5. If the context includes technical terms, explain them appropriately
+6. Do not make assumptions or add information not present in the context
+7. If specific sections of a report are mentioned, maintain their original structure
+8. Format the response in a clear, readable manner
+9. If the context includes chronological information, maintain the proper sequence
+
+Response:"""
+
+        return prompt

     async def retrieve_context(
         self,
@@ -133,15 +204,6 @@ class RAGAgent(ExcelAwareRAGAgent):
     ) -> Tuple[List[str], List[Dict], Optional[List[float]]]:
         """
         Retrieve context with conversation history enhancement
-
-        Args:
-            query (str): Current query
-            conversation_history (Optional[List[Dict]]): Recent conversation history
-            top_k (int): Number of documents to retrieve
-
-        Returns:
-            Tuple[List[str], List[Dict], Optional[List[float]]]:
-                Retrieved documents, sources, and scores
         """
         # Enhance query with conversation history
         if conversation_history:
@@ -153,8 +215,14 @@ class RAGAgent(ExcelAwareRAGAgent):
         else:
             enhanced_query = query

+        # Debug log the enhanced query
+        logger.info(f"Enhanced query: {enhanced_query}")
+
         # Embed the enhanced query
         query_embedding = self.embedding.embed_query(enhanced_query)
+
+        # Debug log embedding shape
+        logger.info(f"Query embedding shape: {len(query_embedding)}")

         # Retrieve similar documents
         results = self.vector_store.similarity_search(
@@ -162,6 +230,12 @@ class RAGAgent(ExcelAwareRAGAgent):
             top_k=top_k
         )

+        # Debug log search results
+        logger.info(f"Number of search results: {len(results)}")
+        for i, result in enumerate(results):
+            logger.info(f"Result {i} score: {result.get('score', 'N/A')}")
+            logger.info(f"Result {i} text preview: {result.get('text', '')[:100]}...")
+
         # Process results
         documents = [doc['text'] for doc in results]
         sources = [self._convert_metadata_to_strings(doc['metadata'])
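The introduction check added above is a plain substring test on the incoming query. A minimal standalone sketch of the same rule (the function name and sample strings are illustrative, not part of the repository):

def looks_like_introduction(query: str) -> bool:
    # Mirrors the substring checks added to generate_response above
    return (
        "wants support" in query
        and "This is Introduction" in query
        and ("A new user with name:" in query or "An old user with name:" in query)
    )

print(looks_like_introduction("A new user with name: Alice wants support. This is Introduction"))  # True
print(looks_like_introduction("What does the quarterly report cover?"))                            # False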
src/agents/system_instructions_rag.py

@@ -103,19 +103,36 @@ class SystemInstructionsRAGAgent(RAGAgent):
         if not context_docs:
             return False

-        # Extract key terms from query
+        # Extract key terms from query (keeping important words)
         query_words = query.lower().split()
+        stop_words = {'me', 'a', 'about', 'what', 'is', 'are', 'the', 'in', 'how', 'why', 'when', 'where'}
+
+        # Remove only basic stop words, keep important terms like "report", "share", etc.
         query_terms = {word for word in query_words if word not in stop_words}
+
+        # Add additional relevant terms that might appear in the content
+        related_terms = {
+            'comprehensive',
+            'report',
+            'overview',
+            'summary',
+            'details',
+            'information'
+        }
+        query_terms.update(word for word in query_words if word in related_terms)
+
         # Check each context document for relevance
         for doc in context_docs:
             if not doc:
                 continue
             doc_lower = doc.lower()
+
+            # Consider document relevant if it contains any query terms
+            # or if it starts with common report headers
+            if any(term in doc_lower for term in query_terms) or \
+               any(header in doc_lower for header in ['overview', 'comprehensive report', 'summary']):
                 return True
+
         return False

     def _create_response_prompt(self, query: str, context_docs: List[str]) -> str:
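The relevance check above boils down to stop-word filtering plus a small header allow-list. A condensed, self-contained sketch of that matching rule (not the class itself; the sample strings are invented):

STOP_WORDS = {'me', 'a', 'about', 'what', 'is', 'are', 'the', 'in', 'how', 'why', 'when', 'where'}
REPORT_HEADERS = ['overview', 'comprehensive report', 'summary']

def doc_is_relevant(query: str, doc: str) -> bool:
    # Keep the meaningful query words, then look for any of them (or a report header) in the document
    query_terms = {w for w in query.lower().split() if w not in STOP_WORDS}
    doc_lower = doc.lower()
    return any(t in doc_lower for t in query_terms) or any(h in doc_lower for h in REPORT_HEADERS)

print(doc_is_relevant("what is in the report", "Comprehensive Report: project overview and budget"))  # True
print(doc_is_relevant("what is in the report", "Unrelated meeting notes"))                            # False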
Binary files a/src/utils/__pycache__/drive_document_processor.cpython-312.pyc and b/src/utils/__pycache__/drive_document_processor.cpython-312.pyc differ
Binary files a/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc and b/src/utils/__pycache__/enhanced_excel_processor.cpython-312.pyc differ
Binary files a/src/utils/__pycache__/google_drive_service.cpython-312.pyc and b/src/utils/__pycache__/google_drive_service.cpython-312.pyc differ
src/utils/drive_document_processor.py

@@ -37,7 +37,7 @@ class DriveDocumentProcessor:
         # Define supported MIME types
         self.supported_mime_types = {
             # Google Docs
+            'application/vnd.google-apps.document': '.docx',

             # Microsoft Word Documents
             'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
@@ -60,33 +60,45 @@ class DriveDocumentProcessor:
             'application/pdf': '.pdf'
         }

-        # Define export MIME types for Google Docs formats
         self.google_docs_export_types = {
             'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
         }

     async def process_documents(
         self,
-        vector_store: ChromaVectorStore
+        vector_store: ChromaVectorStore,
+        include_subfolders: bool = True  # New parameter with default True for backward compatibility
     ) -> Dict[str, Any]:
         """
         Process all documents in the specified Drive folder

         Args:
             vector_store (ChromaVectorStore): Vector store instance
+            include_subfolders (bool): Whether to process documents in subfolders

         Returns:
             Dict[str, Any]: Processing results
         """
         try:
             # Get documents from folder
+            files = self.google_drive_service.get_folder_contents(
+                self.folder_id,
+                include_subfolders=include_subfolders
+            )

             processed_files = []
             skipped_files = []
             errors = []

             for file in files:
+                # Skip if it's a folder
+                if file.get('mimeType') == 'application/vnd.google-apps.folder':
+                    continue
+
+                # Get file path (including folder structure if available)
+                file_path = self._get_file_path(file)
+                file['display_path'] = file_path
+
                 result = await self._process_single_file(file, vector_store)

                 if result['status'] == 'processed':
@@ -122,13 +134,31 @@ class DriveDocumentProcessor:
                 detail=f"Failed to process drive documents: {str(e)}"
             )

+    def _get_file_path(self, file: Dict[str, Any]) -> str:
+        """
+        Get the full path for a file including its folder structure
+
+        Args:
+            file (Dict[str, Any]): File metadata
+
+        Returns:
+            str: Display path of the file
+        """
+        path_parts = [file['name']]
+
+        # Add folder path if available (new structure)
+        if folder_path := file.get('folder_path', []):
+            for folder in reversed(folder_path):
+                path_parts.insert(0, folder['name'])
+
+        return '/'.join(path_parts)
+
     async def _process_single_file(
         self,
         file: Dict[str, Any],
         vector_store: ChromaVectorStore
     ) -> Dict[str, Any]:
         """Process a single Drive file"""
         mime_type = file.get('mimeType', '')

         # Skip if mime type not supported
@@ -137,13 +167,14 @@ class DriveDocumentProcessor:
                 'status': 'skipped',
                 'data': {
                     'name': file['name'],
+                    'path': file.get('display_path', file['name']),
                     'reason': f'Unsupported mime type: {mime_type}'
                 }
             }

         try:
             document_id = file['id']
+            modified_time = file.get('modifiedTime', 'N/A')

             # Check if document should be processed
             if self.save_document(document_id, vector_store, modified_time):
@@ -159,7 +190,7 @@ class DriveDocumentProcessor:
                     str(temp_file_path)
                 )

-                # Add to vector store
+                # Add to vector store with path information
                 self._add_to_vector_store(
                     processed_doc['chunks'],
                     file,
@@ -171,6 +202,7 @@ class DriveDocumentProcessor:
                     'status': 'processed',
                     'data': {
                         'name': file['name'],
+                        'path': file.get('display_path', file['name']),
                         'id': file['id'],
                         'chunks_processed': len(processed_doc['chunks'])
                     }
@@ -181,11 +213,11 @@ class DriveDocumentProcessor:
                 if temp_file_path.exists():
                     temp_file_path.unlink()
             else:
-                # Return skipped status if document already exists and is up to date
                 return {
                     'status': 'skipped',
                     'data': {
                         'name': file['name'],
+                        'path': file.get('display_path', file['name']),
                         'reason': 'Document already exists in the memory.'
                     }
                 }
@@ -196,46 +228,10 @@ class DriveDocumentProcessor:
                 'status': 'error',
                 'data': {
                     'file_name': file['name'],
+                    'path': file.get('display_path', file['name']),
                     'error': str(e)
                 }
             }

     def _add_to_vector_store(
         self,
@@ -244,20 +240,18 @@ class DriveDocumentProcessor:
         mime_type: str,
         vector_store: ChromaVectorStore
     ) -> None:
+        """Add processed chunks to vector store with path information"""
         chunk_metadatas = []
         chunk_ids = []

+        modified_time = file.get('modifiedTime', 'N/A')
+        file_path = file.get('display_path', file['name'])

         for i, chunk in enumerate(chunks):
             chunk_id = f"{file['id']}-chunk-{i}"
             chunk_ids.append(chunk_id)
             chunk_metadatas.append({
+                "source": file_path,  # Use full path instead of just name
                 "document_id": file['id'],
                 "chunk_index": i,
                 "mime_type": mime_type,
@@ -272,44 +266,81 @@ class DriveDocumentProcessor:
             metadatas=chunk_metadatas,
             ids=chunk_ids
         )

+    async def _download_and_save_file(
+        self,
+        file_id: str,
+        mime_type: str
+    ) -> Path:
+        """Download and save file to temporary location"""
+        extension = self.supported_mime_types[mime_type]
+        temp_file_path = self.temp_dir / f"{file_id}{extension}"
+
+        if mime_type in self.google_docs_export_types:
+            # Download Google Doc in the specified export format
+            content = self.google_drive_service.export_file(
+                file_id,
+                self.google_docs_export_types[mime_type]
+            )
+        else:
+            # Download regular file
+            content = self.google_drive_service.download_file(file_id)
+
+        with open(temp_file_path, 'wb') as f:
+            if isinstance(content, str):
+                f.write(content.encode('utf-8'))
+            else:
+                f.write(content)
+
+        return temp_file_path
+
+    def save_document(
+        self,
+        document_id: str,
+        vector_store: ChromaVectorStore,
+        modified_date: str
+    ) -> bool:
+        """
+        Check if document needs to be processed based on modification date
+
         Args:
+            document_id (str): ID of the document to check
+            vector_store (ChromaVectorStore): Vector store instance
+            modified_date (str): Modified date to compare against
+
+        Returns:
+            bool: True if document should be processed, False otherwise
         """
         try:
             # Retrieve all chunks for the given document_id
             chunks = vector_store.get_document_chunks(document_id)
+
             if not chunks:
+                # Document doesn't exist in vector store
                 return True
+
             # Check the modified_time of the first chunk
             first_chunk_metadata = chunks[0].get("metadata", {})
+
             if first_chunk_metadata.get("modified_time") != modified_date:
+                # If modified_time doesn't match, delete existing chunks
                 vector_store.delete_document(document_id)
+                logger.info(f"Document {document_id} has been modified, will reprocess")
                 return True
+
+            logger.info(f"Document {document_id} is up to date, skipping")
+            return False
+
         except Exception as e:
+            logger.error(f"Error checking document status: {str(e)}")
+            # In case of error, process the document to be safe
             return True

     def _cleanup_temp_dir(self) -> None:
         """Clean up temporary directory if empty"""
+        try:
+            if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
+                self.temp_dir.rmdir()
+        except Exception as e:
+            logger.error(f"Error cleaning up temp directory: {str(e)}")
+            # Don't raise the error as this is a cleanup operation
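For reference, the display-path logic added in _get_file_path simply prefixes the file name with the folder chain that get_folder_contents collects. A small hedged sketch of the same idea (the sample metadata below is invented):

def build_display_path(file: dict) -> str:
    # Same approach as _get_file_path above: folder chain first, file name last
    parts = [file['name']]
    for folder in reversed(file.get('folder_path', [])):
        parts.insert(0, folder['name'])
    return '/'.join(parts)

example = {'name': 'budget.xlsx',
           'folder_path': [{'id': 'f1', 'name': 'Finance'}, {'id': 'f2', 'name': '2024'}]}
print(build_display_path(example))  # Finance/2024/budget.xlsx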
src/utils/enhanced_excel_processor.py

@@ -2,7 +2,9 @@ from typing import Dict, List, Any, Optional
 import pandas as pd
 import numpy as np
 from pathlib import Path
+import logging
+from openpyxl import load_workbook
+from openpyxl.utils.cell import get_column_letter

 class EnhancedExcelProcessor:
     def __init__(self):
@@ -13,7 +15,7 @@ class EnhancedExcelProcessor:

     def process_excel(self, file_path: Path) -> str:
         """
+        Process Excel file with enhanced data extraction

         Args:
             file_path (Path): Path to Excel file
@@ -21,97 +23,146 @@ class EnhancedExcelProcessor:
         Returns:
             str: Structured text representation of Excel content
         """
+        # Read all sheets with improved handling
         excel_file = pd.ExcelFile(file_path)
         sheets_data = {}

+        # Load workbook for additional metadata
+        workbook = load_workbook(file_path, data_only=True)
+
         for sheet_name in excel_file.sheet_names:
+            # Read with pandas for data structure
+            df = pd.read_excel(
+                excel_file,
+                sheet_name=sheet_name,
+                header=None  # Read without assuming header to capture all data
+            )
+
+            # Clean column names
+            if df.iloc[0].notna().any():  # If first row has any data
+                df.columns = [f"Column_{i}" if pd.isna(x) else str(x).strip()
+                              for i, x in enumerate(df.iloc[0])]
+                df = df.iloc[1:]  # Remove header row from data
+
             sheets_data[sheet_name] = df

+            # Generate enhanced sheet summary
+            self.sheet_summaries[sheet_name] = self._generate_enhanced_sheet_summary(
+                df,
+                workbook[sheet_name]
+            )

+            # Extract enhanced sheet metadata
+            self.sheet_metadata[sheet_name] = self._extract_enhanced_metadata(
+                df,
+                workbook[sheet_name]
+            )

         # Detect relationships between sheets
         self.relationships = self._detect_relationships(sheets_data)

         # Generate structured text representation
+        return self._generate_enhanced_structured_text(sheets_data, workbook)

+    def _generate_enhanced_sheet_summary(self, df: pd.DataFrame, ws) -> Dict:
+        """Generate comprehensive statistical summary for a sheet"""
         summary = {
             'total_rows': len(df),
             'total_columns': len(df.columns),
             'column_types': {},
             'numeric_summaries': {},
             'categorical_summaries': {},
+            'null_counts': df.isnull().sum().to_dict(),
+            'merged_cells': self._get_merged_cells_info(ws),
+            'formulas': self._get_formulas_info(ws)
         }

+        # Process numeric columns with enhanced detection
         numeric_cols = df.select_dtypes(include=[np.number]).columns
         for col in numeric_cols:
+            col_data = pd.to_numeric(df[col], errors='coerce')
             summary['numeric_summaries'][col] = {
+                'mean': float(col_data.mean()) if not col_data.empty else None,
+                'median': float(col_data.median()) if not col_data.empty else None,
+                'std': float(col_data.std()) if not col_data.empty else None,
+                'min': float(col_data.min()) if not col_data.empty else None,
+                'max': float(col_data.max()) if not col_data.empty else None,
+                'sum': float(col_data.sum()) if not col_data.empty else None
             }
             summary['column_types'][col] = 'numeric'

+        # Process categorical and text columns with enhanced analysis
         categorical_cols = df.select_dtypes(include=['object']).columns
         for col in categorical_cols:
+            # Clean and process values
+            values = df[col].astype(str).replace('nan', pd.NA).dropna()
+            if not values.empty:
+                value_counts = values.value_counts()
+                summary['categorical_summaries'][col] = {
+                    'unique_values': int(len(value_counts)),
+                    'top_values': value_counts.head(5).to_dict(),
+                    'contains_currency': self._detect_currency(values),
+                    'contains_dates': self._detect_dates(values)
+                }
             summary['column_types'][col] = 'categorical'

         return summary

+    def _extract_enhanced_metadata(self, df: pd.DataFrame, ws) -> Dict:
+        """Extract comprehensive metadata including Excel-specific features"""
+        metadata = {
+            'columns': list(df.columns),
+            'rows': len(df),
+            'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
+            'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
+            'categorical_columns': df.select_dtypes(include=['object']).columns.tolist(),
+            'column_widths': {get_column_letter(i+1): ws.column_dimensions[get_column_letter(i+1)].width
+                              for i in range(len(df.columns))
+                              if get_column_letter(i+1) in ws.column_dimensions},
+            'hidden_rows': [idx for idx in range(1, ws.max_row + 1) if ws.row_dimensions[idx].hidden],
+            'hidden_columns': [get_column_letter(idx) for idx in range(1, ws.max_column + 1)
+                               if ws.column_dimensions[get_column_letter(idx)].hidden],
+            'has_charts': bool(ws._charts),
+            'has_images': bool(ws._images),
+            'frozen_panes': ws.freeze_panes is not None
+        }
+        return metadata
+
+    def _get_merged_cells_info(self, ws) -> List[Dict]:
+        """Extract information about merged cells"""
+        merged_cells = []
+        for merged_range in ws.merged_cells.ranges:
+            merged_cells.append({
+                'range': str(merged_range),
+                'start_cell': merged_range.start_cell.coordinate,
+                'end_cell': merged_range.end_cell.coordinate
+            })
+        return merged_cells

+    def _get_formulas_info(self, ws) -> Dict[str, str]:
+        """Extract formulas from the worksheet"""
+        formulas = {}
+        for row in ws.iter_rows():
+            for cell in row:
+                if cell.formula:
+                    formulas[cell.coordinate] = cell.formula
+        return formulas
+
+    def _detect_currency(self, series: pd.Series) -> bool:
+        """Detect if a series contains currency values"""
+        currency_patterns = ['$', '€', '£', '¥']
+        return any(series.astype(str).str.contains('|'.join(currency_patterns)).any())
+
+    def _detect_dates(self, series: pd.Series) -> bool:
+        """Detect if a series contains date values"""
+        try:
+            pd.to_datetime(series, errors='raise')
+            return True
+        except:
+            return False
+
+    def _generate_enhanced_structured_text(self, sheets_data: Dict[str, pd.DataFrame], workbook) -> str:
+        """Generate detailed structured text representation of Excel content"""
         output_parts = []

         # Overall summary
@@ -130,36 +181,59 @@ class EnhancedExcelProcessor:
             # Basic info
             output_parts.append(f"Rows: {metadata['rows']}")
             output_parts.append(f"Columns: {', '.join(metadata['columns'])}")

+            # Add information about hidden elements
+            if metadata['hidden_rows']:
+                output_parts.append(f"Hidden Rows: {len(metadata['hidden_rows'])}")
+            if metadata['hidden_columns']:
+                output_parts.append(f"Hidden Columns: {len(metadata['hidden_columns'])}")
+
+            # Add information about merged cells
+            if summary['merged_cells']:
+                output_parts.append("\nMerged Cells:")
+                for merge_info in summary['merged_cells'][:5]:  # Show first 5 merged ranges
+                    output_parts.append(f"  - Range: {merge_info['range']}")
+
+            # Numeric columns summary
             if metadata['numeric_columns']:
+                output_parts.append("\nNumeric Columns Summary:")
                 for col in metadata['numeric_columns']:
                     stats = summary['numeric_summaries'][col]
                     output_parts.append(f"  {col}:")
                     output_parts.append(f"    Range: {stats['min']} to {stats['max']}")
                     output_parts.append(f"    Average: {stats['mean']:.2f}")
+                    output_parts.append(f"    Sum: {stats['sum']:.2f}")

+            # Categorical columns summary
             if metadata['categorical_columns']:
+                output_parts.append("\nCategorical Columns Summary:")
                 for col in metadata['categorical_columns']:
+                    if col in summary['categorical_summaries']:
+                        cats = summary['categorical_summaries'][col]
+                        output_parts.append(f"  {col}:")
+                        output_parts.append(f"    Unique Values: {cats['unique_values']}")
+                        if cats['top_values']:
+                            output_parts.append("    Top Values: " +
+                                                ", ".join(f"{k} ({v})" for k, v in
+                                                          list(cats['top_values'].items())[:3]))
+                        if cats['contains_currency']:
+                            output_parts.append("    Contains Currency Values")
+                        if cats['contains_dates']:
+                            output_parts.append("    Contains Date Values")
+
+            # Add formula information
+            if summary['formulas']:
+                output_parts.append("\nFormulas Present:")
+                for cell, formula in list(summary['formulas'].items())[:5]:  # Show first 5 formulas
+                    output_parts.append(f"  {cell}: {formula}")

+            # Sample data with improved formatting
+            output_parts.append("\nSample Data:")
+            sample_data = df.head(5).fillna("").to_string(index=False)
+            output_parts.append(sample_data)
             output_parts.append("\n")

+        # Sheet relationships
         if self.relationships:
             output_parts.append("Sheet Relationships:")
             for rel_key, rel_info in self.relationships.items():
@@ -173,7 +247,7 @@ class EnhancedExcelProcessor:
                         f"{parts[0]}.{parts[2]} and {parts[1]}.{parts[3]}")

         return "\n".join(output_parts)
+
     def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
         """Get summary for a specific sheet"""
         return self.sheet_summaries.get(sheet_name)
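A rough usage sketch for the updated processor; the module path and the sample workbook path are assumptions based on the file list above, and pandas/openpyxl must be installed. In the repository the processor is driven through DriveDocumentProcessor rather than called directly like this:

from pathlib import Path
from src.utils.enhanced_excel_processor import EnhancedExcelProcessor  # assumed module path

processor = EnhancedExcelProcessor()
structured_text = processor.process_excel(Path("temp_downloads/example.xlsx"))  # placeholder workbook
print(structured_text[:500])                   # structured summary that gets chunked and embedded
print(processor.get_sheet_summary("Sheet1"))   # per-sheet statistics, if that sheet exists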
src/utils/google_drive_service.py

@@ -3,7 +3,8 @@ from google.oauth2 import service_account
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaIoBaseDownload
 import io
+from typing import List, Dict, Any
+import logging

 class GoogleDriveService:
     def __init__(self, credentials_path: str):
@@ -19,24 +20,61 @@ class GoogleDriveService:
         )
         self.service = build('drive', 'v3', credentials=self.credentials)

-    def get_folder_contents(self, folder_id: str):
+    def get_folder_contents(self, folder_id: str, include_subfolders: bool = False) -> List[Dict[str, Any]]:
         """
-        Get contents of a Drive folder
+        Get contents of a Drive folder including subfolders if specified

         Args:
             folder_id (str): ID of the folder to process
+            include_subfolders (bool): Whether to include contents of subfolders (default: False)

         Returns:
             List[Dict]: List of file metadata
         """
+        all_files = []
+        try:
+            # Get all items in the current folder
+            query = f"'{folder_id}' in parents and trashed=false"
+            results = self.service.files().list(
+                q=query,
+                fields="files(id, name, mimeType, modifiedTime, parents)",
+                supportsAllDrives=True,
+                includeItemsFromAllDrives=True
+            ).execute()
+
+            items = results.get('files', [])
+
+            for item in items:
+                if item['mimeType'] == 'application/vnd.google-apps.folder' and include_subfolders:
+                    # Recursively get contents of subfolder
+                    try:
+                        subfolder_files = self.get_folder_contents(
+                            item['id'],
+                            include_subfolders=True
+                        )
+                        # Add folder path information to each file
+                        for file in subfolder_files:
+                            if not file.get('folder_path'):
+                                file['folder_path'] = []
+                            file['folder_path'].insert(0, {
+                                'id': item['id'],
+                                'name': item['name']
+                            })
+                        all_files.extend(subfolder_files)
+                    except Exception as e:
+                        logging.error(f"Error processing subfolder {item['name']}: {str(e)}")
+                        continue
+                else:
+                    # For backward compatibility, maintain original structure
+                    # but add folder path information
+                    item['folder_path'] = []
+                    all_files.append(item)
+
+            return all_files
+
+        except Exception as e:
+            logging.error(f"Error getting folder contents for folder {folder_id}: {str(e)}")
+            return []  # Return empty list for backward compatibility

     def download_file(self, file_id: str) -> bytes:
         """
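A hedged sketch of calling the recursive listing added above; the credentials path and folder ID are placeholders, and the import path is assumed from the file list at the top of this commit:

from src.utils.google_drive_service import GoogleDriveService  # assumed module path

drive = GoogleDriveService(credentials_path="config/credentials.json")  # placeholder path
files = drive.get_folder_contents("YOUR_FOLDER_ID", include_subfolders=True)  # placeholder ID
for f in files:
    # folder_path is the chain of subfolders collected during recursion
    prefix = "/".join(p["name"] for p in f.get("folder_path", []))
    print(f"{prefix}/{f['name']}" if prefix else f["name"], f["mimeType"])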
Binary files a/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc and b/src/vectorstores/__pycache__/chroma_vectorstore.cpython-312.pyc differ
src/vectorstores/chroma_vectorstore.py

@@ -93,42 +93,62 @@ class ChromaVectorStore(BaseVectorStore):
         **kwargs
     ) -> List[Dict[str, Any]]:
         """
-        Perform similarity search
-
-        Args:
-            query_embedding (List[float]): Embedding of the query
-            top_k (int): Number of top similar documents to retrieve
-            **kwargs: Additional search parameters
-
-        Returns:
-            List[Dict[str, Any]]: List of documents with their text, metadata, and scores
+        Perform similarity search with improved matching
         """
         try:
+            # Increase n_results to get more potential matches
             results = self.collection.query(
                 query_embeddings=[query_embedding],
+                n_results=10,  # Get more initial results
                 include=['documents', 'metadatas', 'distances']
             )
+
-            # Handle the case where no results are found
             if not results or 'documents' not in results or not results['documents']:
+                logging.warning("No results found in similarity search")
                 return []
+
-            # Format results to include text, metadata, and scores
             formatted_results = []
             documents = results['documents'][0]  # First query's results
             metadatas = results['metadatas'][0] if results.get('metadatas') else [None] * len(documents)
             distances = results['distances'][0] if results.get('distances') else [None] * len(documents)
+
+            # Process all results
             for doc, meta, dist in zip(documents, metadatas, distances):
+                # Convert distance to similarity score (1 is most similar, 0 is least)
+                similarity_score = 1.0 - (dist or 0.0) if dist is not None else None
+
+                # More permissive threshold and include all results for filtering
+                if similarity_score is not None and similarity_score > 0.2:  # Lower threshold
+                    formatted_results.append({
+                        'text': doc,
+                        'metadata': meta or {},
+                        'score': similarity_score
+                    })
+
+            # Sort by score and get top_k results
+            formatted_results.sort(key=lambda x: x['score'] or 0, reverse=True)
+
+            # Check if results are from same document and get consecutive chunks
+            if formatted_results:
+                first_doc_id = formatted_results[0]['metadata'].get('document_id')
+                all_chunks_same_doc = []
+
+                # Get all chunks from the same document
+                for result in formatted_results:
+                    if result['metadata'].get('document_id') == first_doc_id:
+                        all_chunks_same_doc.append(result)
+
+                # Sort chunks by their index to maintain document flow
+                all_chunks_same_doc.sort(
+                    key=lambda x: x['metadata'].get('chunk_index', 0)
+                )
+
+                # Return either all chunks from same document or top_k results
+                if len(all_chunks_same_doc) > 0:
+                    return all_chunks_same_doc[:top_k]
+
-            return formatted_results
+            return formatted_results[:top_k]

         except Exception as e:
             logging.error(f"Error performing similarity search in ChromaDB: {str(e)}")
             raise
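The new scoring step converts Chroma's distance into a similarity in [0, 1] and keeps results above 0.2. A standalone illustration of that conversion (not the class itself; the sample distances are made up):

def to_similarity(distance):
    # Same conversion as above: 1.0 means most similar, 0.0 least
    return None if distance is None else 1.0 - distance

distances = [0.05, 0.4, 0.85, None]
kept = [d for d in distances if (s := to_similarity(d)) is not None and s > 0.2]
print(kept)  # [0.05, 0.4] -- the 0.85 distance falls below the 0.2 similarity threshold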
temp_downloads/17he27jN4louYr1xOYASf4BP2e-tGTICt.xlsx: binary file (9.81 kB)
temp_downloads/1K608-Qr03M6nf5FhB6AajbHm8kjQujx1.xlsx: binary file (30.4 kB)