"""Turn an Excel workbook into a structured, human-readable text report."""

import logging
import re
import warnings
from itertools import combinations
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.cell import column_index_from_string, get_column_letter

logger = logging.getLogger(__name__)


class EnhancedExcelProcessor:
    """Collect per-sheet data, statistics and workbook metadata from Excel.

    After :meth:`process_excel` has run, the results for the most recently
    processed file are available through :meth:`get_sheet_summary`,
    :meth:`get_relationships` and :meth:`get_metadata`.
    """

    # Character class, so that "$" is a literal and not a regex anchor.
    _CURRENCY_PATTERN = r"[$€£¥]"
    # Names that `_promote_header_row` invents for blank header cells.
    _PLACEHOLDER_NAME = re.compile(r"Column_\d+")

    def __init__(self):
        """Initialize the enhanced Excel processor"""
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}

    def process_excel(self, file_path: Path) -> str:
        """
        Process Excel file with enhanced data extraction

        Results of any previous call are discarded first, so the getters
        only ever describe the file processed last.

        Args:
            file_path (Path): Path to Excel file

        Returns:
            str: Structured text representation of Excel content
        """
        self.sheet_summaries = {}
        self.sheet_metadata = {}
        self.relationships = {}
        sheets_data: Dict[str, pd.DataFrame] = {}

        # The workbook is only used for metadata (formulas, merged cells,
        # dimensions); cell values come from pandas. It is therefore opened
        # in the default mode: data_only=True would replace every formula by
        # its cached value and leave nothing for _get_formulas_info to find.
        workbook = load_workbook(file_path)
        try:
            with pd.ExcelFile(file_path) as excel_file:
                for sheet_name in excel_file.sheet_names:
                    logger.debug("Processing sheet %r", sheet_name)
                    # Read without assuming a header to capture all data
                    raw = pd.read_excel(
                        excel_file, sheet_name=sheet_name, header=None
                    )
                    df = self._promote_header_row(raw)
                    worksheet = workbook[sheet_name]

                    sheets_data[sheet_name] = df
                    self.sheet_summaries[sheet_name] = (
                        self._generate_enhanced_sheet_summary(df, worksheet)
                    )
                    self.sheet_metadata[sheet_name] = (
                        self._extract_enhanced_metadata(df, worksheet)
                    )

            self.relationships = self._detect_relationships(sheets_data)
            return self._generate_enhanced_structured_text(sheets_data, workbook)
        finally:
            workbook.close()

    @staticmethod
    def _promote_header_row(df: pd.DataFrame) -> pd.DataFrame:
        """Use the first row as column names when it contains any data.

        Blank header cells become ``Column_<position>``; repeated names get
        a ``.1``, ``.2`` ... suffix so every column can be addressed on its
        own. The remaining rows are re-inferred, because a column that held
        a text header above numbers is read as ``object`` dtype.

        A frame that is empty, or whose first row is entirely blank, is
        returned unchanged.
        """
        if df.empty or not df.iloc[0].notna().any():
            return df

        taken = set()
        names = []
        for position, value in enumerate(df.iloc[0]):
            base = f"Column_{position}" if pd.isna(value) else str(value).strip()
            candidate, suffix = base, 0
            while candidate in taken:
                suffix += 1
                candidate = f"{base}.{suffix}"
            taken.add(candidate)
            names.append(candidate)

        body = df.iloc[1:].reset_index(drop=True)
        body.columns = names
        return body.infer_objects()

    @staticmethod
    def _text_columns(df: pd.DataFrame) -> List:
        """Return the labels of object- or string-typed columns."""
        return [
            label
            for label, dtype in df.dtypes.items()
            if pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, pd.StringDtype)
        ]

    def _generate_enhanced_sheet_summary(self, df: pd.DataFrame, ws) -> Dict:
        """Generate comprehensive statistical summary for a sheet

        Args:
            df: Sheet contents.
            ws: Worksheet object the merged-cell and formula details are
                read from.

        Returns:
            Dict with the keys ``total_rows``, ``total_columns``,
            ``column_types``, ``numeric_summaries``,
            ``categorical_summaries``, ``null_counts``, ``merged_cells``
            and ``formulas``.
        """
        summary = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'column_types': {},
            'numeric_summaries': {},
            'categorical_summaries': {},
            'null_counts': {
                col: int(count) for col, count in df.isnull().sum().items()
            },
            'merged_cells': self._get_merged_cells_info(ws),
            'formulas': self._get_formulas_info(ws),
        }

        stat_names = ('mean', 'median', 'std', 'min', 'max', 'sum')
        for col in df.select_dtypes(include=[np.number]).columns:
            col_data = pd.to_numeric(df[col], errors='coerce')
            has_rows = not col_data.empty
            summary['numeric_summaries'][col] = {
                stat: float(getattr(col_data, stat)()) if has_rows else None
                for stat in stat_names
            }
            summary['column_types'][col] = 'numeric'

        for col in self._text_columns(df):
            # Drop real nulls before stringifying, so that the literal text
            # "nan" survives and None does not turn into the value 'None'.
            values = df[col].dropna().astype(str)
            if not values.empty:
                value_counts = values.value_counts()
                summary['categorical_summaries'][col] = {
                    'unique_values': int(len(value_counts)),
                    'top_values': {
                        key: int(count)
                        for key, count in value_counts.head(5).items()
                    },
                    'contains_currency': self._detect_currency(values),
                    'contains_dates': self._detect_dates(values),
                }
            summary['column_types'][col] = 'categorical'

        return summary

    def _extract_enhanced_metadata(self, df: pd.DataFrame, ws) -> Dict:
        """Extract comprehensive metadata including Excel-specific features

        Only dimension entries that already exist on the worksheet are
        inspected; indexing ``row_dimensions``/``column_dimensions`` with a
        missing key would create a new entry in the workbook.
        """
        column_dims = ws.column_dimensions
        max_row = ws.max_row
        max_column = ws.max_column

        column_widths = {}
        for position in range(1, len(df.columns) + 1):
            letter = get_column_letter(position)
            if letter in column_dims:
                column_widths[letter] = column_dims[letter].width

        hidden_rows = sorted(
            index
            for index, dim in ws.row_dimensions.items()
            if dim.hidden and 1 <= index <= max_row
        )

        # A single column dimension may stand for a whole range of columns
        # (its min..max), so expand it instead of looking at the key only.
        hidden_indexes = set()
        for letter, dim in column_dims.items():
            if not dim.hidden:
                continue
            first = dim.min or column_index_from_string(letter)
            last = min(dim.max or first, max_column)
            hidden_indexes.update(range(first, last + 1))

        return {
            'columns': list(df.columns),
            'rows': len(df),
            'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
            'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
            'categorical_columns': self._text_columns(df),
            'column_widths': column_widths,
            'hidden_rows': hidden_rows,
            'hidden_columns': [
                get_column_letter(index) for index in sorted(hidden_indexes)
            ],
            'has_charts': bool(getattr(ws, '_charts', None)),
            'has_images': bool(getattr(ws, '_images', None)),
            'frozen_panes': ws.freeze_panes is not None,
        }

    def _get_merged_cells_info(self, ws) -> List[Dict]:
        """Extract information about merged cells

        The corner coordinates are built from the numeric bounds of each
        range (``min_col``, ``min_row``, ``max_col``, ``max_row``).
        """
        return [
            {
                'range': str(merged_range),
                'start_cell': (
                    f"{get_column_letter(merged_range.min_col)}"
                    f"{merged_range.min_row}"
                ),
                'end_cell': (
                    f"{get_column_letter(merged_range.max_col)}"
                    f"{merged_range.max_row}"
                ),
            }
            for merged_range in ws.merged_cells.ranges
        ]

    def _get_formulas_info(self, ws) -> Dict[str, str]:
        """Extract formulas from the worksheet

        A cell counts as a formula when its ``data_type`` is ``'f'``.

        Returns:
            Mapping of cell coordinate to formula text.
        """
        formulas = {}
        for row in ws.iter_rows():
            for cell in row:
                if cell.data_type != 'f':
                    continue
                value = cell.value
                # Formula objects that expose their source as `.text` are
                # unwrapped; plain string formulas are used as they are.
                formulas[cell.coordinate] = str(getattr(value, 'text', value))
        return formulas

    def _detect_currency(self, series: pd.Series) -> bool:
        """Detect if a series contains currency values

        Returns True when at least one value contains one of $, €, £ or ¥.
        """
        matches = series.astype(str).str.contains(
            self._CURRENCY_PATTERN, regex=True, na=False
        )
        return bool(matches.any())

    def _detect_dates(self, series: pd.Series) -> bool:
        """Detect if a series contains date values

        Returns True when every value can be parsed by ``pd.to_datetime``;
        an empty series is reported as False.
        """
        if series.empty:
            return False
        try:
            with warnings.catch_warnings():
                # Probing free text is expected to hit the parser's
                # fallback path; its format-inference warning is noise here.
                warnings.simplefilter("ignore", UserWarning)
                pd.to_datetime(series, errors='raise')
        except (ValueError, TypeError, OverflowError):
            return False
        return True

    def _detect_relationships(
        self, sheets_data: Dict[str, pd.DataFrame]
    ) -> Dict[str, Dict[str, Any]]:
        """Detect pairs of sheets that share column names.

        Every pair with at least one common, real (non-placeholder, string)
        column name yields an entry keyed ``"<sheet1>__<sheet2>"`` holding
        ``type='potential_join'``, the shared names in ``common_columns``
        and both sheet names in ``sheets``.

        Only ``potential_join`` entries are produced here; the text
        generator additionally understands ``foreign_key`` entries.
        """
        def real_headers(df: pd.DataFrame) -> List[str]:
            return [
                name
                for name in df.columns
                if isinstance(name, str)
                and not self._PLACEHOLDER_NAME.fullmatch(name)
            ]

        relationships: Dict[str, Dict[str, Any]] = {}
        for (left_name, left_df), (right_name, right_df) in combinations(
            sheets_data.items(), 2
        ):
            right_headers = set(real_headers(right_df))
            shared = [
                name for name in real_headers(left_df) if name in right_headers
            ]
            if shared:
                relationships[f"{left_name}__{right_name}"] = {
                    'type': 'potential_join',
                    'common_columns': shared,
                    'sheets': [left_name, right_name],
                }
        return relationships

    @staticmethod
    def _format_stat(value: Optional[float]) -> str:
        """Format a statistic with two decimals, or 'n/a' when missing."""
        return "n/a" if value is None else f"{value:.2f}"

    def _generate_enhanced_structured_text(
        self, sheets_data: Dict[str, pd.DataFrame], workbook
    ) -> str:
        """Generate detailed structured text representation of Excel content

        Reads the per-sheet entries of ``self.sheet_metadata`` and
        ``self.sheet_summaries`` as well as ``self.relationships``.

        Args:
            sheets_data: Sheet name to DataFrame mapping.
            workbook: Accepted for interface compatibility; not used.

        Returns:
            The report as a single newline-joined string.
        """
        output_parts = []

        # Overall summary
        output_parts.append("Excel File Overview:")
        output_parts.append(f"Total Sheets: {len(sheets_data)}")
        output_parts.append("")

        # Sheet details
        for sheet_name, df in sheets_data.items():
            output_parts.append(f"Sheet: {sheet_name}")
            output_parts.append("=" * (len(sheet_name) + 7))

            metadata = self.sheet_metadata[sheet_name]
            summary = self.sheet_summaries[sheet_name]

            # Basic info; labels may be integers when no header was found
            output_parts.append(f"Rows: {metadata['rows']}")
            output_parts.append(
                f"Columns: {', '.join(str(col) for col in metadata['columns'])}"
            )

            # Add information about hidden elements
            if metadata['hidden_rows']:
                output_parts.append(f"Hidden Rows: {len(metadata['hidden_rows'])}")
            if metadata['hidden_columns']:
                output_parts.append(
                    f"Hidden Columns: {len(metadata['hidden_columns'])}"
                )

            # Add information about merged cells (first 5 ranges)
            if summary['merged_cells']:
                output_parts.append("\nMerged Cells:")
                for merge_info in summary['merged_cells'][:5]:
                    output_parts.append(f" - Range: {merge_info['range']}")

            # Numeric columns summary
            if metadata['numeric_columns']:
                output_parts.append("\nNumeric Columns Summary:")
                for col in metadata['numeric_columns']:
                    stats = summary['numeric_summaries'].get(col)
                    if stats is None:
                        continue
                    output_parts.append(f" {col}:")
                    output_parts.append(
                        f" Range: {stats['min']} to {stats['max']}"
                    )
                    output_parts.append(
                        f" Average: {self._format_stat(stats['mean'])}"
                    )
                    output_parts.append(f" Sum: {self._format_stat(stats['sum'])}")

            # Categorical columns summary
            if metadata['categorical_columns']:
                output_parts.append("\nCategorical Columns Summary:")
                for col in metadata['categorical_columns']:
                    cats = summary['categorical_summaries'].get(col)
                    if cats is None:
                        continue
                    output_parts.append(f" {col}:")
                    output_parts.append(f" Unique Values: {cats['unique_values']}")
                    if cats['top_values']:
                        output_parts.append(
                            " Top Values: "
                            + ", ".join(
                                f"{k} ({v})"
                                for k, v in list(cats['top_values'].items())[:3]
                            )
                        )
                    if cats['contains_currency']:
                        output_parts.append(" Contains Currency Values")
                    if cats['contains_dates']:
                        output_parts.append(" Contains Date Values")

            # Add formula information (first 5 formulas)
            if summary['formulas']:
                output_parts.append("\nFormulas Present:")
                for cell, formula in list(summary['formulas'].items())[:5]:
                    output_parts.append(f" {cell}: {formula}")

            # Sample data with improved formatting
            output_parts.append("\nSample Data:")
            output_parts.append(df.head(5).fillna("").to_string(index=False))
            output_parts.append("\n")

        # Sheet relationships
        if self.relationships:
            output_parts.append("Sheet Relationships:")
            for rel_key, rel_info in self.relationships.items():
                if rel_info['type'] == 'potential_join':
                    # Prefer explicit sheet names: splitting the key breaks
                    # for sheet names that themselves contain "__".
                    sheets = rel_info.get('sheets') or rel_key.split('__')
                    if len(sheets) < 2:
                        continue
                    output_parts.append(
                        f"- {sheets[0]} and {sheets[1]} share columns: "
                        + f"{', '.join(str(c) for c in rel_info['common_columns'])}"
                    )
                elif rel_info['type'] == 'foreign_key':
                    parts = rel_key.split('__')
                    if len(parts) < 4:
                        continue
                    output_parts.append(
                        "- Potential foreign key relationship between "
                        + f"{parts[0]}.{parts[2]} and {parts[1]}.{parts[3]}"
                    )

        return "\n".join(output_parts)

    def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
        """Get summary for a specific sheet"""
        return self.sheet_summaries.get(sheet_name)

    def get_relationships(self) -> Dict:
        """Get detected relationships between sheets"""
        return self.relationships

    def get_metadata(self) -> Dict:
        """Get complete metadata for all sheets"""
        return self.sheet_metadata