Spaces:
Running
Running
from typing import Dict, List, Any, Optional | |
import pandas as pd | |
import numpy as np | |
from pathlib import Path | |
import logging | |
from openpyxl import load_workbook | |
from openpyxl.utils.cell import get_column_letter | |
class EnhancedExcelProcessor:
    """Turn an Excel workbook into a structured plain-text report.

    Cell values are read with pandas; openpyxl supplies workbook-level
    details (merged ranges, formulas, hidden rows/columns, column widths).
    After ``process_excel`` has run, the per-sheet results are available
    through ``get_sheet_summary``, ``get_metadata`` and ``get_relationships``.
    """

    # Regex character class of the recognised currency symbols.  Inside
    # "[...]" the "$" is a literal character, not the end-of-string anchor.
    _CURRENCY_PATTERN = r"[$€£¥]"

    # Prefix of the placeholder names given to columns whose header is blank.
    _PLACEHOLDER_PREFIX = "Column_"

    def __init__(self):
        """Initialize the enhanced Excel processor with empty result stores."""
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}

    def process_excel(self, file_path: Path) -> str:
        """
        Process Excel file with enhanced data extraction

        Args:
            file_path (Path): Path to Excel file

        Returns:
            str: Structured text representation of Excel content
        """
        # Start from fresh dicts so sheets of a previously processed file do
        # not leak into this run (and earlier returned dicts stay untouched).
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}
        sheets_data = {}

        # data_only=False keeps formula text in cell.value; with data_only=True
        # openpyxl substitutes cached results and no formula can be reported.
        # The cell *values* used for statistics come from pandas below.
        workbook = load_workbook(file_path, data_only=False)
        try:
            with pd.ExcelFile(file_path) as excel_file:
                for sheet_name in excel_file.sheet_names:
                    # Read without assuming a header so no row is lost.
                    raw = pd.read_excel(
                        excel_file,
                        sheet_name=sheet_name,
                        header=None,
                    )
                    df = self._promote_header_row(raw)
                    worksheet = workbook[sheet_name]

                    sheets_data[sheet_name] = df
                    self.sheet_summaries[sheet_name] = (
                        self._generate_enhanced_sheet_summary(df, worksheet)
                    )
                    self.sheet_metadata[sheet_name] = (
                        self._extract_enhanced_metadata(df, worksheet)
                    )

            self.relationships = self._detect_relationships(sheets_data)
            return self._generate_enhanced_structured_text(sheets_data, workbook)
        finally:
            workbook.close()

    def _promote_header_row(self, raw: pd.DataFrame) -> pd.DataFrame:
        """Use the first row as column names when it holds any data.

        A sheet that is empty, or whose first row is entirely blank, is
        returned unchanged (it keeps pandas' integer column labels).
        """
        if raw.empty or not raw.iloc[0].notna().any():
            return raw

        df = raw.iloc[1:].reset_index(drop=True)
        df.columns = self._header_to_column_names(raw.iloc[0])
        # With the header text gone, columns read as ``object`` can be
        # narrowed to their real dtype (int/float/datetime); without this no
        # column would ever be recognised as numeric.
        return df.infer_objects()

    @classmethod
    def _header_to_column_names(cls, header_row) -> List[str]:
        """Build unique, stripped column names from a header row.

        Blank cells become ``Column_<position>``; repeated names get a
        numeric suffix so that ``df[name]`` always selects a single column.
        """
        names = []
        used = set()
        for position, value in enumerate(header_row):
            if pd.isna(value):
                base = f"{cls._PLACEHOLDER_PREFIX}{position}"
            else:
                base = str(value).strip()
            name = base
            suffix = 2
            while name in used:
                name = f"{base}_{suffix}"
                suffix += 1
            used.add(name)
            names.append(name)
        return names

    @staticmethod
    def _as_float(value) -> Optional[float]:
        """Convert a numeric scalar to ``float``; missing values become None."""
        return None if pd.isna(value) else float(value)

    @staticmethod
    def _format_number(value) -> str:
        """Format a statistic with two decimals, or ``n/a`` when it is None."""
        return "n/a" if value is None else f"{value:.2f}"

    def _generate_enhanced_sheet_summary(self, df: pd.DataFrame, ws) -> Dict:
        """Generate comprehensive statistical summary for a sheet

        Args:
            df (pd.DataFrame): Sheet data as prepared by ``process_excel``
            ws: openpyxl worksheet of the same sheet

        Returns:
            Dict: row/column counts, per-column type, numeric statistics,
            categorical statistics, null counts, merged cells and formulas
        """
        summary = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'column_types': {},
            'numeric_summaries': {},
            'categorical_summaries': {},
            # Plain ints keep the summary serialisable.
            'null_counts': {col: int(count)
                            for col, count in df.isnull().sum().items()},
            'merged_cells': self._get_merged_cells_info(ws),
            'formulas': self._get_formulas_info(ws),
        }

        stat_names = ('mean', 'median', 'std', 'min', 'max', 'sum')
        for col in df.select_dtypes(include=[np.number]).columns:
            valid = pd.to_numeric(df[col], errors='coerce').dropna()
            if valid.empty:
                stats = dict.fromkeys(stat_names)
            else:
                # std of a single value is NaN; _as_float maps that to None.
                stats = {name: self._as_float(getattr(valid, name)())
                         for name in stat_names}
            summary['numeric_summaries'][col] = stats
            summary['column_types'][col] = 'numeric'

        for col in df.select_dtypes(include=['datetime64']).columns:
            summary['column_types'][col] = 'datetime'

        for col in df.select_dtypes(include=['object']).columns:
            # Drop missing cells *before* stringifying so that real text such
            # as "nan" is kept and None never turns into the string "None".
            values = df[col].dropna().astype(str)
            if not values.empty:
                value_counts = values.value_counts()
                summary['categorical_summaries'][col] = {
                    'unique_values': int(len(value_counts)),
                    'top_values': {key: int(count) for key, count
                                   in value_counts.head(5).items()},
                    'contains_currency': self._detect_currency(values),
                    'contains_dates': self._detect_dates(values),
                }
            summary['column_types'][col] = 'categorical'

        return summary

    def _extract_enhanced_metadata(self, df: pd.DataFrame, ws) -> Dict:
        """Extract comprehensive metadata including Excel-specific features

        Args:
            df (pd.DataFrame): Sheet data as prepared by ``process_excel``
            ws: openpyxl worksheet of the same sheet

        Returns:
            Dict: column names and dtype groups from ``df`` plus widths,
            hidden rows/columns, chart/image presence and freeze state
            read from ``ws``
        """
        col_dims = ws.column_dimensions
        row_dims = ws.row_dimensions

        # Membership is tested before indexing: these mappings create a
        # default entry on plain item access, which would mutate the sheet.
        column_widths = {}
        for position in range(1, len(df.columns) + 1):
            letter = get_column_letter(position)
            if letter in col_dims:
                column_widths[letter] = col_dims[letter].width

        hidden_rows = [idx for idx in range(1, ws.max_row + 1)
                       if idx in row_dims and row_dims[idx].hidden]

        # NOTE(review): openpyxl may store a run of identically formatted
        # columns under its first letter only; such a run would then be
        # reported as one hidden column - confirm if exact counts matter.
        hidden_columns = []
        for position in range(1, ws.max_column + 1):
            letter = get_column_letter(position)
            if letter in col_dims and col_dims[letter].hidden:
                hidden_columns.append(letter)

        return {
            'columns': list(df.columns),
            'rows': len(df),
            'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
            'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
            'categorical_columns': df.select_dtypes(include=['object']).columns.tolist(),
            'column_widths': column_widths,
            'hidden_rows': hidden_rows,
            'hidden_columns': hidden_columns,
            # _charts/_images are private openpyxl attributes; tolerate
            # their absence instead of failing the whole extraction.
            'has_charts': bool(getattr(ws, '_charts', None)),
            'has_images': bool(getattr(ws, '_images', None)),
            'frozen_panes': ws.freeze_panes is not None,
        }

    def _get_merged_cells_info(self, ws) -> List[Dict]:
        """Extract information about merged cells

        Returns one dict per merged range with its coordinates (``range``)
        and its top-left / bottom-right cells (``start_cell``, ``end_cell``).
        """
        merged_cells = []
        for merged_range in ws.merged_cells.ranges:
            coord = str(merged_range)
            # Derive both corners from the "A1:B2" text rather than relying
            # on cell attributes of the range object.
            start, _, end = coord.partition(':')
            merged_cells.append({
                'range': coord,
                'start_cell': start,
                'end_cell': end or start,
            })
        return merged_cells

    def _get_formulas_info(self, ws) -> Dict[str, str]:
        """Extract formulas from the worksheet

        Returns a mapping of cell coordinate to formula text.  Formulas are
        only visible when the workbook was loaded with ``data_only=False``.
        """
        formulas = {}
        for row in ws.iter_rows():
            for cell in row:
                # openpyxl marks formula cells with data_type "f" and keeps
                # the formula itself in cell.value.
                if getattr(cell, 'data_type', None) != 'f':
                    continue
                value = cell.value
                # Array formulas are wrapper objects exposing ``text``.
                formulas[cell.coordinate] = str(getattr(value, 'text', value))
        return formulas

    def _detect_currency(self, series: pd.Series) -> bool:
        """Detect if a series contains currency values

        Returns True when at least one value contains $, €, £ or ¥.
        """
        as_text = series.astype(str)
        return bool(as_text.str.contains(self._CURRENCY_PATTERN, regex=True).any())

    def _detect_dates(self, series: pd.Series) -> bool:
        """Detect if a series contains date values

        Returns True only when the series is non-empty and every value can
        be parsed by ``pd.to_datetime``.
        """
        if series.empty:
            return False
        try:
            pd.to_datetime(series, errors='raise')
        except (ValueError, TypeError, OverflowError):
            return False
        return True

    def _detect_relationships(self, sheets_data: Dict[str, pd.DataFrame]) -> Dict:
        """Detect relationships between sheets

        Two sheets are related when they share at least one named column.
        Each relation is stored under ``"<sheet1>__<sheet2>"`` with
        ``type == 'potential_join'``, the two sheet names (``sheets``) and
        the shared names (``common_columns``) - the shape rendered by
        ``_generate_enhanced_structured_text``.
        """
        relationships = {}
        sheet_names = list(sheets_data)
        for position, left in enumerate(sheet_names):
            left_columns = self._named_columns(sheets_data[left])
            for right in sheet_names[position + 1:]:
                right_columns = set(self._named_columns(sheets_data[right]))
                common = [col for col in left_columns if col in right_columns]
                if common:
                    relationships[f"{left}__{right}"] = {
                        'type': 'potential_join',
                        'sheets': [left, right],
                        'common_columns': common,
                    }
        return relationships

    @classmethod
    def _named_columns(cls, df: pd.DataFrame) -> List[str]:
        """Return the column labels that come from real header text.

        Integer labels (header-less sheets) and ``Column_<n>`` placeholders
        are skipped: matching on them would relate every pair of sheets.
        """
        return [col for col in df.columns
                if isinstance(col, str)
                and not col.startswith(cls._PLACEHOLDER_PREFIX)]

    def _generate_enhanced_structured_text(self, sheets_data: Dict[str, pd.DataFrame], workbook) -> str:
        """Generate detailed structured text representation of Excel content

        Args:
            sheets_data (Dict[str, pd.DataFrame]): Sheet name -> sheet data;
                every name must be present in ``self.sheet_metadata`` and
                ``self.sheet_summaries``
            workbook: openpyxl workbook (accepted for interface
                compatibility; not used by this method)

        Returns:
            str: The report, one section per sheet followed by the detected
            sheet relationships
        """
        output_parts = [
            "Excel File Overview:",
            f"Total Sheets: {len(sheets_data)}",
            "",
        ]

        for sheet_name, df in sheets_data.items():
            output_parts.append(f"Sheet: {sheet_name}")
            output_parts.append("=" * (len(sheet_name) + 7))

            metadata = self.sheet_metadata[sheet_name]
            summary = self.sheet_summaries[sheet_name]

            output_parts.append(f"Rows: {metadata['rows']}")
            # Header-less sheets have integer labels, so stringify first.
            column_names = ', '.join(str(col) for col in metadata['columns'])
            output_parts.append(f"Columns: {column_names}")

            if metadata['hidden_rows']:
                output_parts.append(f"Hidden Rows: {len(metadata['hidden_rows'])}")
            if metadata['hidden_columns']:
                output_parts.append(f"Hidden Columns: {len(metadata['hidden_columns'])}")

            if summary['merged_cells']:
                output_parts.append("\nMerged Cells:")
                for merge_info in summary['merged_cells'][:5]:  # first 5 ranges
                    output_parts.append(f" - Range: {merge_info['range']}")

            if metadata['numeric_columns']:
                output_parts.append("\nNumeric Columns Summary:")
                for col in metadata['numeric_columns']:
                    stats = summary['numeric_summaries'].get(col)
                    if stats is None:
                        continue
                    low = "n/a" if stats['min'] is None else stats['min']
                    high = "n/a" if stats['max'] is None else stats['max']
                    output_parts.append(f" {col}:")
                    output_parts.append(f" Range: {low} to {high}")
                    output_parts.append(f" Average: {self._format_number(stats['mean'])}")
                    output_parts.append(f" Sum: {self._format_number(stats['sum'])}")

            if metadata['categorical_columns']:
                output_parts.append("\nCategorical Columns Summary:")
                for col in metadata['categorical_columns']:
                    cats = summary['categorical_summaries'].get(col)
                    if cats is None:
                        continue
                    output_parts.append(f" {col}:")
                    output_parts.append(f" Unique Values: {cats['unique_values']}")
                    if cats['top_values']:
                        top_three = list(cats['top_values'].items())[:3]
                        output_parts.append(
                            " Top Values: "
                            + ", ".join(f"{k} ({v})" for k, v in top_three)
                        )
                    if cats['contains_currency']:
                        output_parts.append(" Contains Currency Values")
                    if cats['contains_dates']:
                        output_parts.append(" Contains Date Values")

            if summary['formulas']:
                output_parts.append("\nFormulas Present:")
                for cell, formula in list(summary['formulas'].items())[:5]:  # first 5
                    output_parts.append(f" {cell}: {formula}")

            output_parts.append("\nSample Data:")
            sample_data = df.head(5).fillna("").to_string(index=False)
            output_parts.append(sample_data)
            output_parts.append("\n")

        if self.relationships:
            output_parts.append("Sheet Relationships:")
            for rel_key, rel_info in self.relationships.items():
                # Prefer names stored in the entry; splitting the key is only
                # a fallback because sheet names may themselves contain "__".
                key_parts = rel_key.split('__')
                sheets = rel_info.get('sheets') or key_parts[:2]
                if len(sheets) < 2:
                    continue
                rel_type = rel_info.get('type')
                if rel_type == 'potential_join':
                    common = ', '.join(
                        str(col) for col in rel_info.get('common_columns', [])
                    )
                    output_parts.append(
                        f"- {sheets[0]} and {sheets[1]} share columns: {common}"
                    )
                elif rel_type == 'foreign_key':
                    columns = rel_info.get('columns') or key_parts[2:4]
                    if len(columns) < 2:
                        continue
                    output_parts.append(
                        "- Potential foreign key relationship between "
                        f"{sheets[0]}.{columns[0]} and {sheets[1]}.{columns[1]}"
                    )

        return "\n".join(output_parts)

    def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
        """Get summary for a specific sheet (None if the sheet is unknown)"""
        return self.sheet_summaries.get(sheet_name)

    def get_relationships(self) -> Dict:
        """Get detected relationships between sheets"""
        return self.relationships

    def get_metadata(self) -> Dict:
        """Get complete metadata for all sheets"""
        return self.sheet_metadata