Spaces:
Running
Running
Enhanced the support for the excel file and added endpoint to have optimized vector store and Rag for the Excel.
b953016
# src/agents/excel_aware_rag.py | |
from typing import List, Dict, Optional, Set | |
from src.utils.logger import logger | |
class ExcelAwareRAGAgent: | |
"""Extension of RAGAgent with enhanced Excel handling""" | |
def _process_excel_context(self, context_docs: List[str], query: str) -> List[str]: | |
""" | |
Process and enhance context for Excel-related queries | |
Args: | |
context_docs (List[str]): Original context documents | |
query (str): User query | |
Returns: | |
List[str]: Enhanced context documents | |
""" | |
excel_context = [] | |
for doc in context_docs: | |
if 'Sheet:' in doc: # Identify Excel content | |
# Extract relevant sheet context based on query | |
relevant_sheets = self._identify_relevant_sheets(doc, query) | |
for sheet in relevant_sheets: | |
sheet_context = self._extract_sheet_context(doc, sheet) | |
if sheet_context: | |
excel_context.append(sheet_context) | |
# Add relationship context if query suggests multi-sheet analysis | |
if self._needs_relationship_context(query): | |
relationship_context = self._extract_relationship_context(doc) | |
if relationship_context: | |
excel_context.append(relationship_context) | |
else: | |
excel_context.append(doc) | |
return excel_context | |
def _identify_relevant_sheets(self, doc: str, query: str) -> List[str]: | |
"""Identify sheets relevant to the query""" | |
sheets = [] | |
current_sheet = None | |
# Extract sheet names from the document | |
for line in doc.split('\n'): | |
if line.startswith('Sheet: '): | |
current_sheet = line.replace('Sheet: ', '').strip() | |
# Check if sheet name or its contents are relevant to query | |
if self._is_relevant_to_query(current_sheet, query): | |
sheets.append(current_sheet) | |
return sheets | |
def _is_relevant_to_query(self, sheet_name: str, query: str) -> bool: | |
"""Check if a sheet is relevant to the query""" | |
# Convert to lower case for comparison | |
query_lower = query.lower() | |
sheet_lower = sheet_name.lower() | |
# Direct mention of sheet name | |
if sheet_lower in query_lower: | |
return True | |
# Check for related terms | |
sheet_terms = set(sheet_lower.split()) | |
query_terms = set(query_lower.split()) | |
# If there's significant term overlap | |
common_terms = sheet_terms.intersection(query_terms) | |
if len(common_terms) > 0: | |
return True | |
return False | |
def _extract_sheet_context(self, doc: str, sheet_name: str) -> Optional[str]: | |
"""Extract context for a specific sheet""" | |
lines = doc.split('\n') | |
sheet_context = [] | |
in_target_sheet = False | |
for line in lines: | |
if line.startswith(f'Sheet: {sheet_name}'): | |
in_target_sheet = True | |
sheet_context.append(line) | |
elif line.startswith('Sheet: '): | |
in_target_sheet = False | |
elif in_target_sheet: | |
sheet_context.append(line) | |
return '\n'.join(sheet_context) if sheet_context else None | |
def _needs_relationship_context(self, query: str) -> bool: | |
"""Determine if query needs relationship context""" | |
relationship_indicators = [ | |
'compare', 'relationship', 'between', 'across', 'correlation', | |
'related', 'connection', 'link', 'join', 'combine', 'multiple sheets', | |
'all sheets', 'different sheets' | |
] | |
query_lower = query.lower() | |
return any(indicator in query_lower for indicator in relationship_indicators) | |
def _extract_relationship_context(self, doc: str) -> Optional[str]: | |
"""Extract relationship context from document""" | |
lines = doc.split('\n') | |
relationship_context = [] | |
in_relationships = False | |
for line in lines: | |
if 'Sheet Relationships:' in line: | |
in_relationships = True | |
relationship_context.append(line) | |
elif in_relationships and line.strip() and not line.startswith('Sheet: '): | |
relationship_context.append(line) | |
elif in_relationships and line.startswith('Sheet: '): | |
break | |
return '\n'.join(relationship_context) if relationship_context else None | |
async def enhance_excel_response( | |
self, | |
query: str, | |
response: str, | |
context_docs: List[str] | |
) -> str: | |
""" | |
Enhance response for Excel-related queries | |
Args: | |
query (str): Original query | |
response (str): Generated response | |
context_docs (List[str]): Context documents | |
Returns: | |
str: Enhanced response | |
""" | |
if not any('Sheet:' in doc for doc in context_docs): | |
return response | |
try: | |
# Enhance response with specific Excel insights | |
enhanced_parts = [response] | |
# Add sheet-specific insights if relevant | |
if self._needs_sheet_specific_insights(query): | |
insights = self._generate_sheet_insights(query, context_docs) | |
if insights: | |
enhanced_parts.append("\nAdditional Sheet Insights:") | |
enhanced_parts.extend(insights) | |
# Add relationship insights if relevant | |
if self._needs_relationship_context(query): | |
relationship_insights = self._generate_relationship_insights(context_docs) | |
if relationship_insights: | |
enhanced_parts.append("\nSheet Relationship Insights:") | |
enhanced_parts.extend(relationship_insights) | |
return "\n".join(enhanced_parts) | |
except Exception as e: | |
logger.error(f"Error enhancing Excel response: {str(e)}") | |
return response # Fall back to original response if enhancement fails | |
def _needs_sheet_specific_insights(self, query: str) -> bool: | |
"""Determine if query needs sheet-specific insights""" | |
insight_indicators = [ | |
'analyze', 'summarize', 'tell me about', 'what is in', | |
'show me', 'describe', 'explain', 'give me details' | |
] | |
query_lower = query.lower() | |
return any(indicator in query_lower for indicator in insight_indicators) | |
def _generate_sheet_insights(self, query: str, context_docs: List[str]) -> List[str]: | |
"""Generate insights for relevant sheets""" | |
insights = [] | |
relevant_sheets = set() | |
# Collect relevant sheets from context | |
for doc in context_docs: | |
if 'Sheet:' in doc: | |
sheets = self._identify_relevant_sheets(doc, query) | |
relevant_sheets.update(sheets) | |
# Generate insights for each relevant sheet | |
for sheet in relevant_sheets: | |
sheet_insights = self._generate_single_sheet_insights(sheet, context_docs) | |
if sheet_insights: | |
insights.extend(sheet_insights) | |
return insights | |
def _generate_single_sheet_insights( | |
self, | |
sheet_name: str, | |
context_docs: List[str] | |
) -> List[str]: | |
"""Generate insights for a single sheet""" | |
insights = [] | |
sheet_context = None | |
# Find context for this sheet | |
for doc in context_docs: | |
if f'Sheet: {sheet_name}' in doc: | |
sheet_context = self._extract_sheet_context(doc, sheet_name) | |
break | |
if not sheet_context: | |
return insights | |
# Extract and summarize key information | |
if 'Numeric Columns Summary:' in sheet_context: | |
numeric_insights = self._extract_numeric_insights(sheet_context) | |
if numeric_insights: | |
insights.extend(numeric_insights) | |
if 'Categorical Columns Summary:' in sheet_context: | |
categorical_insights = self._extract_categorical_insights(sheet_context) | |
if categorical_insights: | |
insights.extend(categorical_insights) | |
return insights | |
def _generate_relationship_insights(self, context_docs: List[str]) -> List[str]: | |
"""Generate insights about relationships between sheets""" | |
insights = [] | |
for doc in context_docs: | |
relationship_context = self._extract_relationship_context(doc) | |
if relationship_context: | |
# Process and format relationship information | |
relationships = relationship_context.split('\n')[1:] # Skip header | |
for rel in relationships: | |
if rel.strip(): | |
insights.append(f"- {rel.strip()}") | |
return insights |