"""Turn a multi-sheet Excel workbook into a structured text description."""

import json
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd


class EnhancedExcelProcessor:
    """Summarise every sheet of an Excel workbook and relate the sheets.

    After ``process_excel`` has run, the instance exposes three dictionaries
    describing the most recently processed workbook:

    * ``sheet_summaries`` - per-sheet statistics (see
      ``_generate_sheet_summary``).
    * ``sheet_metadata`` - per-sheet column lists grouped by dtype.
    * ``relationships`` - shared columns and value overlaps between sheets.
    """

    # dtype selectors handed to ``DataFrame.select_dtypes``. Text columns are
    # matched through both ``object`` and the pandas string dtype so that
    # string columns are still recognised when pandas does not store them as
    # ``object``.
    _NUMERIC_DTYPES = [np.number]
    _DATETIME_DTYPES = ['datetime64']
    _TEXT_DTYPES = ['object', 'string']

    def __init__(self):
        """Initialize the enhanced Excel processor"""
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}

    def process_excel(self, file_path: Path) -> str:
        """
        Process Excel file with enhanced multi-sheet handling

        Any state left over from a previously processed workbook is replaced,
        so the getters always describe the file passed to the latest call.

        Args:
            file_path (Path): Path to Excel file

        Returns:
            str: Structured text representation of Excel content
        """
        sheets_data: Dict[str, pd.DataFrame] = {}

        # The context manager closes the workbook handle even if reading one
        # of the sheets raises.
        with pd.ExcelFile(file_path) as excel_file:
            for sheet_name in excel_file.sheet_names:
                sheets_data[sheet_name] = pd.read_excel(
                    excel_file, sheet_name=sheet_name
                )

        return self._process_sheets(sheets_data)

    def _process_sheets(self, sheets_data: Dict[str, pd.DataFrame]) -> str:
        """Analyse already-loaded sheets and return the structured text.

        Rebuilds ``sheet_summaries``, ``sheet_metadata`` and ``relationships``
        from scratch so that no entry from an earlier workbook survives.
        """
        self.sheet_summaries = {
            name: self._generate_sheet_summary(df)
            for name, df in sheets_data.items()
        }
        self.sheet_metadata = {
            name: self._build_sheet_metadata(df)
            for name, df in sheets_data.items()
        }
        self.relationships = self._detect_relationships(sheets_data)
        return self._generate_structured_text(sheets_data)

    def _build_sheet_metadata(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Return the column labels of ``df`` grouped by dtype, plus row count."""
        return {
            'columns': list(df.columns),
            'rows': len(df),
            'numeric_columns': df.select_dtypes(
                include=self._NUMERIC_DTYPES
            ).columns.tolist(),
            'date_columns': df.select_dtypes(
                include=self._DATETIME_DTYPES
            ).columns.tolist(),
            'categorical_columns': df.select_dtypes(
                include=self._TEXT_DTYPES
            ).columns.tolist(),
        }

    def _generate_sheet_summary(self, df: pd.DataFrame) -> Dict:
        """Generate statistical summary for a sheet

        Returns a dict with the keys ``total_rows``, ``total_columns``,
        ``column_types``, ``numeric_summaries`` (mean/median/std/min/max per
        numeric column), ``categorical_summaries`` (distinct-value count and
        the five most frequent values per text column) and ``null_counts``.
        """
        summary = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'column_types': {},
            'numeric_summaries': {},
            'categorical_summaries': {},
            # Plain ints keep the summary serialisable with the json module.
            'null_counts': {
                col: int(count) for col, count in df.isnull().sum().items()
            },
        }

        # Process numeric columns
        for col in df.select_dtypes(include=self._NUMERIC_DTYPES).columns:
            series = df[col]
            summary['numeric_summaries'][col] = {
                'mean': float(series.mean()),
                'median': float(series.median()),
                'std': float(series.std()),
                'min': float(series.min()),
                'max': float(series.max()),
            }
            summary['column_types'][col] = 'numeric'

        # Datetime columns get a type label so column_types agrees with the
        # 'date_columns' entry of the sheet metadata.
        for col in df.select_dtypes(include=self._DATETIME_DTYPES).columns:
            summary['column_types'][col] = 'datetime'

        # Process categorical columns
        for col in df.select_dtypes(include=self._TEXT_DTYPES).columns:
            value_counts = df[col].value_counts()
            summary['categorical_summaries'][col] = {
                'unique_values': int(len(value_counts)),
                'top_values': value_counts.head(5).to_dict(),
            }
            summary['column_types'][col] = 'categorical'

        return summary

    def _detect_relationships(self, sheets_data: Dict[str, pd.DataFrame]) -> Dict:
        """Detect potential relationships between sheets

        Every unordered pair of sheets is examined once, in the order the
        sheets appear in ``sheets_data``. Two kinds of records are produced:

        * ``potential_join`` under the key ``"<sheet1>__<sheet2>"`` when the
          two sheets share at least one column label.
        * ``foreign_key`` under the key
          ``"<sheet1>__<sheet2>__<col1>__<col2>"`` when at least one of the two
          column labels ends in ``_id`` (case-insensitive) and the columns have
          a non-null value in common.

        Each record also stores the sheet names under ``'sheets'`` so that
        consumers never have to parse them back out of the key.
        """
        relationships: Dict[str, Dict[str, Any]] = {}
        sheet_names = list(sheets_data)
        value_cache: Dict[Any, set] = {}

        def non_null_values(sheet: str, col: Any) -> set:
            # Build each column's value set once instead of once per pair.
            cache_key = (sheet, col)
            if cache_key not in value_cache:
                value_cache[cache_key] = set(sheets_data[sheet][col].dropna())
            return value_cache[cache_key]

        def looks_like_id(col: Any) -> bool:
            # Column labels may be ints or timestamps, hence the str().
            return str(col).lower().endswith('_id')

        for index, sheet1 in enumerate(sheet_names):
            cols1 = list(sheets_data[sheet1].columns)
            for sheet2 in sheet_names[index + 1:]:
                cols2 = list(sheets_data[sheet2].columns)
                cols2_lookup = set(cols2)

                # Keep the first sheet's column order so output is stable.
                common_cols = [col for col in cols1 if col in cols2_lookup]
                if common_cols:
                    relationships[f"{sheet1}__{sheet2}"] = {
                        'common_columns': common_cols,
                        'type': 'potential_join',
                        'sheets': [sheet1, sheet2],
                    }

                # Check for foreign key relationships
                for col1 in cols1:
                    col1_is_id = looks_like_id(col1)
                    for col2 in cols2:
                        if not (col1_is_id or looks_like_id(col2)):
                            continue
                        values1 = non_null_values(sheet1, col1)
                        values2 = non_null_values(sheet2, col2)
                        if not values1.isdisjoint(values2):
                            key = f"{sheet1}__{sheet2}__{col1}__{col2}"
                            relationships[key] = {
                                'type': 'foreign_key',
                                'columns': [col1, col2],
                                'sheets': [sheet1, sheet2],
                            }

        return relationships

    def _generate_structured_text(self, sheets_data: Dict[str, pd.DataFrame]) -> str:
        """Generate structured text representation of Excel content

        Reads ``sheet_metadata``, ``sheet_summaries`` and ``relationships``,
        which must already hold an entry for every sheet in ``sheets_data``.
        """
        output_parts: List[str] = [
            "Excel File Overview:",
            f"Total Sheets: {len(sheets_data)}",
            "",
        ]

        # Sheet details
        for sheet_name, df in sheets_data.items():
            output_parts.extend(self._sheet_lines(sheet_name, df))

        # Relationships
        if self.relationships:
            output_parts.extend(self._relationship_lines())

        return "\n".join(output_parts)

    def _sheet_lines(self, sheet_name: str, df: pd.DataFrame) -> List[str]:
        """Return the text lines describing a single sheet."""
        metadata = self.sheet_metadata[sheet_name]
        summary = self.sheet_summaries[sheet_name]

        header = f"Sheet: {sheet_name}"
        column_labels = ', '.join(str(col) for col in metadata['columns'])
        lines = [
            header,
            "=" * len(header),
            # Basic info
            f"Rows: {metadata['rows']}",
            f"Columns: {column_labels}",
            "",
        ]

        # Column summaries
        if metadata['numeric_columns']:
            lines.append("Numeric Columns Summary:")
            for col in metadata['numeric_columns']:
                stats = summary['numeric_summaries'][col]
                lines.append(f" {col}:")
                lines.append(f" Range: {stats['min']} to {stats['max']}")
                lines.append(f" Average: {stats['mean']:.2f}")
            lines.append("")

        if metadata['categorical_columns']:
            lines.append("Categorical Columns Summary:")
            for col in metadata['categorical_columns']:
                cats = summary['categorical_summaries'][col]
                lines.append(f" {col}:")
                lines.append(f" Unique Values: {cats['unique_values']}")
                if cats['top_values']:
                    top_three = list(cats['top_values'].items())[:3]
                    lines.append(
                        " Top Values: "
                        + ", ".join(f"{k} ({v})" for k, v in top_three)
                    )
            lines.append("")

        # Sample data
        lines.append("Sample Data:")
        lines.append(df.head(3).to_string())
        lines.append("\n")
        return lines

    def _relationship_lines(self) -> List[str]:
        """Return the text lines describing the detected relationships."""
        lines = ["Sheet Relationships:"]
        for rel_key, rel_info in self.relationships.items():
            # Prefer the names stored in the record; splitting the key is only
            # a fallback for records without 'sheets', because it breaks as
            # soon as a sheet or column name itself contains "__".
            sheets = rel_info.get('sheets') or rel_key.split('__')[:2]

            if rel_info['type'] == 'potential_join':
                shared = ', '.join(str(col) for col in rel_info['common_columns'])
                lines.append(
                    f"- {sheets[0]} and {sheets[1]} share columns: {shared}"
                )
            elif rel_info['type'] == 'foreign_key':
                col1, col2 = rel_info['columns']
                lines.append(
                    "- Potential foreign key relationship between "
                    f"{sheets[0]}.{col1} and {sheets[1]}.{col2}"
                )
        return lines

    def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
        """Get summary for a specific sheet"""
        return self.sheet_summaries.get(sheet_name)

    def get_relationships(self) -> Dict:
        """Get detected relationships between sheets"""
        return self.relationships

    def get_metadata(self) -> Dict:
        """Get complete metadata for all sheets"""
        return self.sheet_metadata