chatbot-backend / src /agents /excel_aware_rag.py
TalatMasood's picture
Enhanced the support for the excel file and added endpoint to have optimized vector store and Rag for the Excel.
b953016
raw
history blame
9.27 kB
# src/agents/excel_aware_rag.py
from typing import List, Dict, Optional, Set
from src.utils.logger import logger
class ExcelAwareRAGAgent:
"""Extension of RAGAgent with enhanced Excel handling"""
def _process_excel_context(self, context_docs: List[str], query: str) -> List[str]:
"""
Process and enhance context for Excel-related queries
Args:
context_docs (List[str]): Original context documents
query (str): User query
Returns:
List[str]: Enhanced context documents
"""
excel_context = []
for doc in context_docs:
if 'Sheet:' in doc: # Identify Excel content
# Extract relevant sheet context based on query
relevant_sheets = self._identify_relevant_sheets(doc, query)
for sheet in relevant_sheets:
sheet_context = self._extract_sheet_context(doc, sheet)
if sheet_context:
excel_context.append(sheet_context)
# Add relationship context if query suggests multi-sheet analysis
if self._needs_relationship_context(query):
relationship_context = self._extract_relationship_context(doc)
if relationship_context:
excel_context.append(relationship_context)
else:
excel_context.append(doc)
return excel_context
def _identify_relevant_sheets(self, doc: str, query: str) -> List[str]:
"""Identify sheets relevant to the query"""
sheets = []
current_sheet = None
# Extract sheet names from the document
for line in doc.split('\n'):
if line.startswith('Sheet: '):
current_sheet = line.replace('Sheet: ', '').strip()
# Check if sheet name or its contents are relevant to query
if self._is_relevant_to_query(current_sheet, query):
sheets.append(current_sheet)
return sheets
def _is_relevant_to_query(self, sheet_name: str, query: str) -> bool:
"""Check if a sheet is relevant to the query"""
# Convert to lower case for comparison
query_lower = query.lower()
sheet_lower = sheet_name.lower()
# Direct mention of sheet name
if sheet_lower in query_lower:
return True
# Check for related terms
sheet_terms = set(sheet_lower.split())
query_terms = set(query_lower.split())
# If there's significant term overlap
common_terms = sheet_terms.intersection(query_terms)
if len(common_terms) > 0:
return True
return False
def _extract_sheet_context(self, doc: str, sheet_name: str) -> Optional[str]:
"""Extract context for a specific sheet"""
lines = doc.split('\n')
sheet_context = []
in_target_sheet = False
for line in lines:
if line.startswith(f'Sheet: {sheet_name}'):
in_target_sheet = True
sheet_context.append(line)
elif line.startswith('Sheet: '):
in_target_sheet = False
elif in_target_sheet:
sheet_context.append(line)
return '\n'.join(sheet_context) if sheet_context else None
def _needs_relationship_context(self, query: str) -> bool:
"""Determine if query needs relationship context"""
relationship_indicators = [
'compare', 'relationship', 'between', 'across', 'correlation',
'related', 'connection', 'link', 'join', 'combine', 'multiple sheets',
'all sheets', 'different sheets'
]
query_lower = query.lower()
return any(indicator in query_lower for indicator in relationship_indicators)
def _extract_relationship_context(self, doc: str) -> Optional[str]:
"""Extract relationship context from document"""
lines = doc.split('\n')
relationship_context = []
in_relationships = False
for line in lines:
if 'Sheet Relationships:' in line:
in_relationships = True
relationship_context.append(line)
elif in_relationships and line.strip() and not line.startswith('Sheet: '):
relationship_context.append(line)
elif in_relationships and line.startswith('Sheet: '):
break
return '\n'.join(relationship_context) if relationship_context else None
async def enhance_excel_response(
self,
query: str,
response: str,
context_docs: List[str]
) -> str:
"""
Enhance response for Excel-related queries
Args:
query (str): Original query
response (str): Generated response
context_docs (List[str]): Context documents
Returns:
str: Enhanced response
"""
if not any('Sheet:' in doc for doc in context_docs):
return response
try:
# Enhance response with specific Excel insights
enhanced_parts = [response]
# Add sheet-specific insights if relevant
if self._needs_sheet_specific_insights(query):
insights = self._generate_sheet_insights(query, context_docs)
if insights:
enhanced_parts.append("\nAdditional Sheet Insights:")
enhanced_parts.extend(insights)
# Add relationship insights if relevant
if self._needs_relationship_context(query):
relationship_insights = self._generate_relationship_insights(context_docs)
if relationship_insights:
enhanced_parts.append("\nSheet Relationship Insights:")
enhanced_parts.extend(relationship_insights)
return "\n".join(enhanced_parts)
except Exception as e:
logger.error(f"Error enhancing Excel response: {str(e)}")
return response # Fall back to original response if enhancement fails
def _needs_sheet_specific_insights(self, query: str) -> bool:
"""Determine if query needs sheet-specific insights"""
insight_indicators = [
'analyze', 'summarize', 'tell me about', 'what is in',
'show me', 'describe', 'explain', 'give me details'
]
query_lower = query.lower()
return any(indicator in query_lower for indicator in insight_indicators)
def _generate_sheet_insights(self, query: str, context_docs: List[str]) -> List[str]:
"""Generate insights for relevant sheets"""
insights = []
relevant_sheets = set()
# Collect relevant sheets from context
for doc in context_docs:
if 'Sheet:' in doc:
sheets = self._identify_relevant_sheets(doc, query)
relevant_sheets.update(sheets)
# Generate insights for each relevant sheet
for sheet in relevant_sheets:
sheet_insights = self._generate_single_sheet_insights(sheet, context_docs)
if sheet_insights:
insights.extend(sheet_insights)
return insights
def _generate_single_sheet_insights(
self,
sheet_name: str,
context_docs: List[str]
) -> List[str]:
"""Generate insights for a single sheet"""
insights = []
sheet_context = None
# Find context for this sheet
for doc in context_docs:
if f'Sheet: {sheet_name}' in doc:
sheet_context = self._extract_sheet_context(doc, sheet_name)
break
if not sheet_context:
return insights
# Extract and summarize key information
if 'Numeric Columns Summary:' in sheet_context:
numeric_insights = self._extract_numeric_insights(sheet_context)
if numeric_insights:
insights.extend(numeric_insights)
if 'Categorical Columns Summary:' in sheet_context:
categorical_insights = self._extract_categorical_insights(sheet_context)
if categorical_insights:
insights.extend(categorical_insights)
return insights
def _generate_relationship_insights(self, context_docs: List[str]) -> List[str]:
"""Generate insights about relationships between sheets"""
insights = []
for doc in context_docs:
relationship_context = self._extract_relationship_context(doc)
if relationship_context:
# Process and format relationship information
relationships = relationship_context.split('\n')[1:] # Skip header
for rel in relationships:
if rel.strip():
insights.append(f"- {rel.strip()}")
return insights