File size: 9,274 Bytes
b953016
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# src/agents/excel_aware_rag.py
from typing import List, Dict, Optional, Set
from src.utils.logger import logger

class ExcelAwareRAGAgent:
    """Extension of RAGAgent with enhanced Excel handling"""
    
    def _process_excel_context(self, context_docs: List[str], query: str) -> List[str]:
        """
        Process and enhance context for Excel-related queries
        
        Args:
            context_docs (List[str]): Original context documents
            query (str): User query
            
        Returns:
            List[str]: Enhanced context documents
        """
        excel_context = []
        
        for doc in context_docs:
            if 'Sheet:' in doc:  # Identify Excel content
                # Extract relevant sheet context based on query
                relevant_sheets = self._identify_relevant_sheets(doc, query)
                for sheet in relevant_sheets:
                    sheet_context = self._extract_sheet_context(doc, sheet)
                    if sheet_context:
                        excel_context.append(sheet_context)
                
                # Add relationship context if query suggests multi-sheet analysis
                if self._needs_relationship_context(query):
                    relationship_context = self._extract_relationship_context(doc)
                    if relationship_context:
                        excel_context.append(relationship_context)
            else:
                excel_context.append(doc)
        
        return excel_context

    def _identify_relevant_sheets(self, doc: str, query: str) -> List[str]:
        """Identify sheets relevant to the query"""
        sheets = []
        current_sheet = None
        
        # Extract sheet names from the document
        for line in doc.split('\n'):
            if line.startswith('Sheet: '):
                current_sheet = line.replace('Sheet: ', '').strip()
                # Check if sheet name or its contents are relevant to query
                if self._is_relevant_to_query(current_sheet, query):
                    sheets.append(current_sheet)
                    
        return sheets

    def _is_relevant_to_query(self, sheet_name: str, query: str) -> bool:
        """Check if a sheet is relevant to the query"""
        # Convert to lower case for comparison
        query_lower = query.lower()
        sheet_lower = sheet_name.lower()
        
        # Direct mention of sheet name
        if sheet_lower in query_lower:
            return True
            
        # Check for related terms
        sheet_terms = set(sheet_lower.split())
        query_terms = set(query_lower.split())
        
        # If there's significant term overlap
        common_terms = sheet_terms.intersection(query_terms)
        if len(common_terms) > 0:
            return True
            
        return False

    def _extract_sheet_context(self, doc: str, sheet_name: str) -> Optional[str]:
        """Extract context for a specific sheet"""
        lines = doc.split('\n')
        sheet_context = []
        in_target_sheet = False
        
        for line in lines:
            if line.startswith(f'Sheet: {sheet_name}'):
                in_target_sheet = True
                sheet_context.append(line)
            elif line.startswith('Sheet: '):
                in_target_sheet = False
            elif in_target_sheet:
                sheet_context.append(line)
        
        return '\n'.join(sheet_context) if sheet_context else None

    def _needs_relationship_context(self, query: str) -> bool:
        """Determine if query needs relationship context"""
        relationship_indicators = [
            'compare', 'relationship', 'between', 'across', 'correlation',
            'related', 'connection', 'link', 'join', 'combine', 'multiple sheets',
            'all sheets', 'different sheets'
        ]
        
        query_lower = query.lower()
        return any(indicator in query_lower for indicator in relationship_indicators)

    def _extract_relationship_context(self, doc: str) -> Optional[str]:
        """Extract relationship context from document"""
        lines = doc.split('\n')
        relationship_context = []
        in_relationships = False
        
        for line in lines:
            if 'Sheet Relationships:' in line:
                in_relationships = True
                relationship_context.append(line)
            elif in_relationships and line.strip() and not line.startswith('Sheet: '):
                relationship_context.append(line)
            elif in_relationships and line.startswith('Sheet: '):
                break
                
        return '\n'.join(relationship_context) if relationship_context else None

    async def enhance_excel_response(
        self,
        query: str,
        response: str,
        context_docs: List[str]
    ) -> str:
        """
        Enhance response for Excel-related queries
        
        Args:
            query (str): Original query
            response (str): Generated response
            context_docs (List[str]): Context documents
            
        Returns:
            str: Enhanced response
        """
        if not any('Sheet:' in doc for doc in context_docs):
            return response
            
        try:
            # Enhance response with specific Excel insights
            enhanced_parts = [response]
            
            # Add sheet-specific insights if relevant
            if self._needs_sheet_specific_insights(query):
                insights = self._generate_sheet_insights(query, context_docs)
                if insights:
                    enhanced_parts.append("\nAdditional Sheet Insights:")
                    enhanced_parts.extend(insights)
            
            # Add relationship insights if relevant
            if self._needs_relationship_context(query):
                relationship_insights = self._generate_relationship_insights(context_docs)
                if relationship_insights:
                    enhanced_parts.append("\nSheet Relationship Insights:")
                    enhanced_parts.extend(relationship_insights)
            
            return "\n".join(enhanced_parts)
        except Exception as e:
            logger.error(f"Error enhancing Excel response: {str(e)}")
            return response  # Fall back to original response if enhancement fails

    def _needs_sheet_specific_insights(self, query: str) -> bool:
        """Determine if query needs sheet-specific insights"""
        insight_indicators = [
            'analyze', 'summarize', 'tell me about', 'what is in',
            'show me', 'describe', 'explain', 'give me details'
        ]
        
        query_lower = query.lower()
        return any(indicator in query_lower for indicator in insight_indicators)

    def _generate_sheet_insights(self, query: str, context_docs: List[str]) -> List[str]:
        """Generate insights for relevant sheets"""
        insights = []
        relevant_sheets = set()
        
        # Collect relevant sheets from context
        for doc in context_docs:
            if 'Sheet:' in doc:
                sheets = self._identify_relevant_sheets(doc, query)
                relevant_sheets.update(sheets)
        
        # Generate insights for each relevant sheet
        for sheet in relevant_sheets:
            sheet_insights = self._generate_single_sheet_insights(sheet, context_docs)
            if sheet_insights:
                insights.extend(sheet_insights)
        
        return insights

    def _generate_single_sheet_insights(
        self,
        sheet_name: str,
        context_docs: List[str]
    ) -> List[str]:
        """Generate insights for a single sheet"""
        insights = []
        sheet_context = None
        
        # Find context for this sheet
        for doc in context_docs:
            if f'Sheet: {sheet_name}' in doc:
                sheet_context = self._extract_sheet_context(doc, sheet_name)
                break
        
        if not sheet_context:
            return insights
        
        # Extract and summarize key information
        if 'Numeric Columns Summary:' in sheet_context:
            numeric_insights = self._extract_numeric_insights(sheet_context)
            if numeric_insights:
                insights.extend(numeric_insights)
                
        if 'Categorical Columns Summary:' in sheet_context:
            categorical_insights = self._extract_categorical_insights(sheet_context)
            if categorical_insights:
                insights.extend(categorical_insights)
        
        return insights

    def _generate_relationship_insights(self, context_docs: List[str]) -> List[str]:
        """Generate insights about relationships between sheets"""
        insights = []
        
        for doc in context_docs:
            relationship_context = self._extract_relationship_context(doc)
            if relationship_context:
                # Process and format relationship information
                relationships = relationship_context.split('\n')[1:]  # Skip header
                for rel in relationships:
                    if rel.strip():
                        insights.append(f"- {rel.strip()}")
        
        return insights