File size: 11,983 Bytes
b953016
 
 
 
be32fd8
 
 
b953016
 
 
 
 
 
 
 
 
 
be32fd8
b953016
 
 
 
 
 
 
be32fd8
b953016
 
 
be32fd8
 
 
b953016
be32fd8
 
 
 
 
 
 
 
 
 
 
 
 
b953016
 
be32fd8
 
 
 
 
b953016
be32fd8
 
 
 
 
b953016
 
 
 
 
be32fd8
b953016
be32fd8
 
b953016
 
 
 
 
 
be32fd8
 
 
b953016
 
be32fd8
b953016
 
be32fd8
b953016
be32fd8
 
 
 
 
 
b953016
 
 
be32fd8
b953016
 
be32fd8
 
 
 
 
 
 
 
 
 
b953016
 
 
 
be32fd8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b953016
be32fd8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b953016
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be32fd8
 
 
 
 
 
 
 
 
 
 
 
 
b953016
be32fd8
b953016
 
 
 
 
be32fd8
b953016
be32fd8
b953016
be32fd8
b953016
be32fd8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b953016
be32fd8
 
 
 
b953016
 
be32fd8
b953016
 
 
 
 
 
 
 
 
 
 
 
 
be32fd8
b953016
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
from typing import Dict, List, Any, Optional
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from openpyxl import load_workbook
from openpyxl.utils.cell import get_column_letter

class EnhancedExcelProcessor:
    """Summarise an Excel workbook as statistics, metadata and a text report.

    ``process_excel`` is the entry point. It fills three per-file
    dictionaries keyed by sheet name (``sheet_summaries``,
    ``sheet_metadata``) or by sheet pair (``relationships``) and returns
    a human-readable report built from them.
    """

    # Currency symbols recognised by ``_detect_currency``. Written as a regex
    # character class so that ``$`` is matched literally instead of acting
    # as the end-of-string anchor.
    _CURRENCY_PATTERN = '[$€£¥]'

    def __init__(self):
        """Initialize the enhanced Excel processor with empty per-file state."""
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}

    def process_excel(self, file_path: Path) -> str:
        """
        Process Excel file with enhanced data extraction

        Args:
            file_path (Path): Path to Excel file

        Returns:
            str: Structured text representation of Excel content
        """
        # Start from a clean slate so that reusing one processor for several
        # files does not leak sheets of an earlier file into the getters.
        self.sheet_summaries = {}
        self.sheet_metadata = {}
        self.relationships = {}
        sheets_data = {}

        # ``data_only=True`` exposes cached cell values but discards formulas,
        # so a second load with ``data_only=False`` is needed to read them.
        workbook = load_workbook(file_path, data_only=True)
        formula_workbook = load_workbook(file_path, data_only=False)

        with pd.ExcelFile(file_path) as excel_file:
            for sheet_name in excel_file.sheet_names:
                # Read without assuming a header so no row is silently lost.
                df = pd.read_excel(excel_file, sheet_name=sheet_name, header=None)

                # Promote the first row to column names when it holds any
                # data. The emptiness guard avoids IndexError on blank sheets.
                if not df.empty and df.iloc[0].notna().any():
                    df.columns = self._build_column_names(df.iloc[0])
                    df = df.iloc[1:].reset_index(drop=True)

                # The header row forced every column to ``object`` dtype;
                # re-infer so numeric and datetime columns are recognised.
                df = df.infer_objects()

                sheets_data[sheet_name] = df

                self.sheet_summaries[sheet_name] = self._generate_enhanced_sheet_summary(
                    df,
                    workbook[sheet_name],
                    formula_ws=formula_workbook[sheet_name],
                )
                self.sheet_metadata[sheet_name] = self._extract_enhanced_metadata(
                    df,
                    workbook[sheet_name],
                )

        # Detect relationships between sheets
        self.relationships = self._detect_relationships(sheets_data)

        # Generate structured text representation
        return self._generate_enhanced_structured_text(sheets_data, workbook)

    @staticmethod
    def _build_column_names(header_values) -> List[str]:
        """Derive unique, non-empty column names from a header row.

        Args:
            header_values: Iterable of raw header cell values.

        Returns:
            One name per cell. Missing or blank cells become ``Column_<i>``
            and repeated names receive a ``_<n>`` suffix, so that selecting
            a column by name always yields a single Series.
        """
        names = []
        taken = set()
        for position, raw in enumerate(header_values):
            base = "" if pd.isna(raw) else str(raw).strip()
            if not base:
                base = f"Column_{position}"
            candidate = base
            suffix = 1
            while candidate in taken:
                candidate = f"{base}_{suffix}"
                suffix += 1
            taken.add(candidate)
            names.append(candidate)
        return names

    @staticmethod
    def _as_float(value) -> Optional[float]:
        """Convert a scalar statistic to ``float``, mapping NaN/None to ``None``."""
        return None if pd.isna(value) else float(value)

    def _generate_enhanced_sheet_summary(self, df: pd.DataFrame, ws, formula_ws=None) -> Dict:
        """Generate comprehensive statistical summary for a sheet

        Args:
            df: Sheet data with the header row already removed.
            ws: openpyxl worksheet used for merged-cell information.
            formula_ws: Optional worksheet from a workbook loaded with
                ``data_only=False``; formulas are read from it when given,
                otherwise from ``ws``.

        Returns:
            Dict with row/column counts, per-column type labels, numeric and
            categorical summaries, null counts, merged cells and formulas.
        """
        summary = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'column_types': {},
            'numeric_summaries': {},
            'categorical_summaries': {},
            'null_counts': df.isnull().sum().to_dict(),
            'merged_cells': self._get_merged_cells_info(ws),
            'formulas': self._get_formulas_info(ws if formula_ws is None else formula_ws)
        }

        # Numeric columns: statistics are computed over non-missing values.
        for col in df.select_dtypes(include=[np.number]).columns:
            valid = pd.to_numeric(df[col], errors='coerce').dropna()
            if valid.empty:
                stats = dict.fromkeys(('mean', 'median', 'std', 'min', 'max', 'sum'))
            else:
                stats = {
                    'mean': self._as_float(valid.mean()),
                    'median': self._as_float(valid.median()),
                    # std of a single value is NaN and is reported as None
                    'std': self._as_float(valid.std()),
                    'min': self._as_float(valid.min()),
                    'max': self._as_float(valid.max()),
                    'sum': self._as_float(valid.sum())
                }
            summary['numeric_summaries'][col] = stats
            summary['column_types'][col] = 'numeric'

        # Categorical/text columns. Missing values are dropped *before* the
        # string cast so a genuine "nan" text value is not discarded.
        for col in df.select_dtypes(include=['object']).columns:
            values = df[col].dropna().astype(str)
            if not values.empty:
                value_counts = values.value_counts()
                summary['categorical_summaries'][col] = {
                    'unique_values': int(len(value_counts)),
                    'top_values': value_counts.head(5).to_dict(),
                    'contains_currency': self._detect_currency(values),
                    'contains_dates': self._detect_dates(values)
                }
            summary['column_types'][col] = 'categorical'

        # Label whatever is left (datetime, boolean, ...) so that every
        # column has an entry in ``column_types``.
        for col, dtype in df.dtypes.items():
            if col not in summary['column_types']:
                is_datetime = pd.api.types.is_datetime64_any_dtype(dtype)
                summary['column_types'][col] = 'datetime' if is_datetime else str(dtype)

        return summary

    def _extract_enhanced_metadata(self, df: pd.DataFrame, ws) -> Dict:
        """Extract comprehensive metadata including Excel-specific features

        Args:
            df: Sheet data with the header row already removed.
            ws: openpyxl worksheet the data came from.

        Returns:
            Dict describing columns, dtype groups, column widths, hidden
            rows/columns, and whether charts, images or frozen panes exist.
        """
        col_dims = ws.column_dimensions
        row_dims = ws.row_dimensions
        max_column = ws.max_column

        data_letters = [get_column_letter(i + 1) for i in range(len(df.columns))]
        sheet_letters = [get_column_letter(idx) for idx in range(1, max_column + 1)]

        # A run of adjacent columns with identical settings may be stored as
        # a single dimension carrying ``min``/``max`` bounds, so expand those
        # ranges instead of probing each letter only.
        hidden_letters = set()
        for dim in list(col_dims.values()):
            if dim.hidden and dim.min and dim.max:
                last = min(dim.max, max_column)
                hidden_letters.update(
                    get_column_letter(idx) for idx in range(dim.min, last + 1)
                )

        # Membership checks avoid creating dimension entries as a side effect
        # of merely looking them up.
        hidden_columns = [
            letter for letter in sheet_letters
            if letter in hidden_letters
            or (letter in col_dims and col_dims[letter].hidden)
        ]
        hidden_rows = [
            idx for idx in range(1, ws.max_row + 1)
            if idx in row_dims and row_dims[idx].hidden
        ]

        metadata = {
            'columns': list(df.columns),
            'rows': len(df),
            'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
            'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
            'categorical_columns': df.select_dtypes(include=['object']).columns.tolist(),
            'column_widths': {letter: col_dims[letter].width
                              for letter in data_letters
                              if letter in col_dims},
            'hidden_rows': hidden_rows,
            'hidden_columns': hidden_columns,
            # ``_charts``/``_images`` are private openpyxl attributes; fall
            # back to "none" if a worksheet type does not provide them.
            'has_charts': bool(getattr(ws, '_charts', None)),
            'has_images': bool(getattr(ws, '_images', None)),
            'frozen_panes': ws.freeze_panes is not None
        }
        return metadata

    def _get_merged_cells_info(self, ws) -> List[Dict]:
        """Extract information about merged cells

        Returns:
            One dict per merged range with its ``range`` string and the
            coordinates of its ``start_cell`` and ``end_cell``.
        """
        return [
            {
                'range': str(merged_range),
                'start_cell': merged_range.start_cell.coordinate,
                'end_cell': merged_range.end_cell.coordinate
            }
            for merged_range in ws.merged_cells.ranges
        ]

    def _get_formulas_info(self, ws) -> Dict[str, str]:
        """Extract formulas from the worksheet

        Args:
            ws: openpyxl worksheet. Formulas are only present when its
                workbook was loaded with ``data_only=False``.

        Returns:
            Mapping of cell coordinate to formula text.
        """
        formulas = {}
        for row in ws.iter_rows():
            for cell in row:
                # openpyxl marks formula cells with data type 'f'; there is
                # no ``formula`` attribute on cells.
                if cell.data_type == 'f':
                    value = cell.value
                    # Array formulas are objects exposing the text as ``.text``.
                    formulas[cell.coordinate] = str(getattr(value, 'text', value))
        return formulas

    def _detect_currency(self, series: pd.Series) -> bool:
        """Detect if a series contains currency values

        Returns:
            True when at least one value contains ``$``, ``€``, ``£`` or ``¥``.
        """
        as_text = series.astype(str)
        return bool(as_text.str.contains(self._CURRENCY_PATTERN, regex=True).any())

    def _detect_dates(self, series: pd.Series) -> bool:
        """Detect if a series contains date values

        Returns:
            True when every value can be parsed by ``pd.to_datetime``.
        """
        try:
            pd.to_datetime(series, errors='raise')
        except (ValueError, TypeError, OverflowError):
            # Parsing failures are the expected "not a date column" signal.
            return False
        return True

    def _detect_relationships(self, sheets_data: Dict[str, pd.DataFrame]) -> Dict[str, Dict]:
        """Detect potential join relationships between sheets.

        Two sheets are related when they share at least one named column.
        Integer labels (sheets without a header row) and generated
        ``Column_<i>`` placeholders are ignored because they carry no meaning.

        Args:
            sheets_data: Mapping of sheet name to its DataFrame.

        Returns:
            Mapping of ``"<sheet_a>__<sheet_b>"`` to a dict with ``type``
            (``'potential_join'``), ``sheets`` (the two sheet names) and
            ``common_columns`` (shared names in the first sheet's order).
        """
        def named_columns(frame: pd.DataFrame) -> List[str]:
            # Keep only real header names.
            return [
                col for col in frame.columns
                if isinstance(col, str)
                and not (col.startswith("Column_") and col[len("Column_"):].isdigit())
            ]

        relationships = {}
        names = list(sheets_data)
        for position, left in enumerate(names):
            left_columns = named_columns(sheets_data[left])
            for right in names[position + 1:]:
                right_columns = set(named_columns(sheets_data[right]))
                shared = [col for col in left_columns if col in right_columns]
                if shared:
                    relationships[f"{left}__{right}"] = {
                        'type': 'potential_join',
                        'sheets': [left, right],
                        'common_columns': shared
                    }
        return relationships

    def _generate_enhanced_structured_text(self, sheets_data: Dict[str, pd.DataFrame], workbook) -> str:
        """Generate detailed structured text representation of Excel content

        Args:
            sheets_data: Mapping of sheet name to its DataFrame.
            workbook: Loaded workbook (not used by the current rendering).

        Returns:
            Multi-line report built from ``sheet_metadata``,
            ``sheet_summaries`` and ``relationships``.
        """
        def show(value, spec: str = "") -> str:
            # Missing statistics are rendered as N/A instead of crashing.
            return "N/A" if value is None else format(value, spec)

        output_parts = []

        # Overall summary
        output_parts.append("Excel File Overview:")
        output_parts.append(f"Total Sheets: {len(sheets_data)}")
        output_parts.append("")

        # Sheet details
        for sheet_name, df in sheets_data.items():
            output_parts.append(f"Sheet: {sheet_name}")
            output_parts.append("=" * (len(sheet_name) + 7))

            metadata = self.sheet_metadata[sheet_name]
            summary = self.sheet_summaries[sheet_name]

            # Basic info; labels are integers when the sheet had no header.
            output_parts.append(f"Rows: {metadata['rows']}")
            output_parts.append(f"Columns: {', '.join(map(str, metadata['columns']))}")

            # Add information about hidden elements
            if metadata['hidden_rows']:
                output_parts.append(f"Hidden Rows: {len(metadata['hidden_rows'])}")
            if metadata['hidden_columns']:
                output_parts.append(f"Hidden Columns: {len(metadata['hidden_columns'])}")

            # Add information about merged cells (first 5 ranges)
            if summary['merged_cells']:
                output_parts.append("\nMerged Cells:")
                for merge_info in summary['merged_cells'][:5]:
                    output_parts.append(f"  - Range: {merge_info['range']}")

            # Numeric columns summary
            if metadata['numeric_columns']:
                output_parts.append("\nNumeric Columns Summary:")
                for col in metadata['numeric_columns']:
                    stats = summary['numeric_summaries'].get(col)
                    if stats is None:
                        continue
                    output_parts.append(f"  {col}:")
                    output_parts.append(f"    Range: {show(stats['min'])} to {show(stats['max'])}")
                    output_parts.append(f"    Average: {show(stats['mean'], '.2f')}")
                    output_parts.append(f"    Sum: {show(stats['sum'], '.2f')}")

            # Categorical columns summary
            if metadata['categorical_columns']:
                output_parts.append("\nCategorical Columns Summary:")
                for col in metadata['categorical_columns']:
                    cats = summary['categorical_summaries'].get(col)
                    if cats is None:
                        continue
                    output_parts.append(f"  {col}:")
                    output_parts.append(f"    Unique Values: {cats['unique_values']}")
                    if cats['top_values']:
                        top_three = list(cats['top_values'].items())[:3]
                        output_parts.append("    Top Values: " +
                                            ", ".join(f"{k} ({v})" for k, v in top_three))
                    if cats['contains_currency']:
                        output_parts.append("    Contains Currency Values")
                    if cats['contains_dates']:
                        output_parts.append("    Contains Date Values")

            # Add formula information (first 5 formulas)
            if summary['formulas']:
                output_parts.append("\nFormulas Present:")
                for cell, formula in list(summary['formulas'].items())[:5]:
                    output_parts.append(f"  {cell}: {formula}")

            # Sample data. Cast to object first so blanks can replace missing
            # values in numeric and datetime columns as well.
            output_parts.append("\nSample Data:")
            preview = df.head(5).astype(object)
            preview = preview.where(preview.notna(), "")
            output_parts.append(preview.to_string(index=False))
            output_parts.append("\n")

        # Sheet relationships
        if self.relationships:
            output_parts.append("Sheet Relationships:")
            for rel_key, rel_info in self.relationships.items():
                key_parts = rel_key.split('__')
                # Prefer explicit sheet names: splitting the key is ambiguous
                # when a sheet name itself contains "__".
                sheets = rel_info.get('sheets', key_parts)
                if rel_info['type'] == 'potential_join':
                    output_parts.append(f"- {sheets[0]} and {sheets[1]} share columns: " +
                                        f"{', '.join(map(str, rel_info['common_columns']))}")
                elif rel_info['type'] == 'foreign_key':
                    output_parts.append("- Potential foreign key relationship between " +
                                        f"{key_parts[0]}.{key_parts[2]} and {key_parts[1]}.{key_parts[3]}")

        return "\n".join(output_parts)

    def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
        """Get summary for a specific sheet, or None if the sheet is unknown"""
        return self.sheet_summaries.get(sheet_name)

    def get_relationships(self) -> Dict:
        """Get detected relationships between sheets"""
        return self.relationships

    def get_metadata(self) -> Dict:
        """Get complete metadata for all sheets"""
        return self.sheet_metadata