# chatbot-backend/src/utils/enhanced_excel_processor.py
# Page header captured together with the file (not part of the code):
# author TalatMasood, commit be32fd8 ("Changes to be committed:"), size 12 kB.
from typing import Dict, List, Any, Optional
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from openpyxl import load_workbook
from openpyxl.utils.cell import get_column_letter
class EnhancedExcelProcessor:
    """Turn an Excel workbook into structured text plus per-sheet metadata.

    After ``process_excel`` has run, three dictionaries describe the most
    recently processed workbook:

    * ``sheet_summaries`` - per-sheet statistics, merged cells and formulas
    * ``sheet_metadata``  - per-sheet column lists and worksheet properties
    * ``relationships``   - heuristically detected links between sheets
    """

    # Prefix of the name given to a column whose header cell is empty.
    _PLACEHOLDER_PREFIX = "Column_"
    # A character class, so that "$" is a literal and not the regex
    # end-of-string anchor (which would match every value).
    _CURRENCY_PATTERN = r"[$€£¥]"

    def __init__(self):
        """Initialize the enhanced Excel processor"""
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}

    def process_excel(self, file_path: Path) -> str:
        """
        Process Excel file with enhanced data extraction

        Args:
            file_path (Path): Path to Excel file

        Returns:
            str: Structured text representation of Excel content
        """
        # Start from a clean slate so that sheets of a previously processed
        # file do not linger in the summaries/metadata of this one.
        self.sheet_summaries = {}
        self.sheet_metadata = {}
        self.relationships = {}
        sheets_data = {}

        # The workbook is only used for worksheet-level information (merged
        # cells, hidden rows/columns, widths, formulas); cell values come from
        # pandas. data_only=False keeps the formula text, which
        # data_only=True would replace by cached results.
        workbook = load_workbook(file_path, data_only=False)
        try:
            with pd.ExcelFile(file_path) as excel_file:
                for sheet_name in excel_file.sheet_names:
                    # Read without assuming a header so no row is lost.
                    raw = pd.read_excel(
                        excel_file,
                        sheet_name=sheet_name,
                        header=None
                    )
                    df = self._promote_header_row(raw)
                    sheets_data[sheet_name] = df

                    worksheet = workbook[sheet_name]
                    self.sheet_summaries[sheet_name] = (
                        self._generate_enhanced_sheet_summary(df, worksheet)
                    )
                    self.sheet_metadata[sheet_name] = (
                        self._extract_enhanced_metadata(df, worksheet)
                    )

            self.relationships = self._detect_relationships(sheets_data)
            return self._generate_enhanced_structured_text(sheets_data, workbook)
        finally:
            workbook.close()

    def _promote_header_row(self, df: pd.DataFrame) -> pd.DataFrame:
        """Use the first row as column names when it holds any data.

        Empty header cells become ``Column_<position>``; repeated names get a
        numeric suffix so every column can be addressed on its own. An empty
        frame, or one whose first row is entirely empty, is returned as is.
        """
        if df.empty:
            return df
        header = df.iloc[0]
        if not header.notna().any():
            return df

        names = []
        seen = {}
        for position, cell in enumerate(header):
            if pd.isna(cell):
                name = f"{self._PLACEHOLDER_PREFIX}{position}"
            else:
                name = str(cell).strip()
            repeats = seen.get(name, 0)
            seen[name] = repeats + 1
            names.append(name if repeats == 0 else f"{name}_{repeats}")

        body = df.iloc[1:].copy()
        body.columns = names
        # With header=None the header text shares a column with the data, so
        # every column arrives as ``object``; re-infer the real dtypes now
        # that the header row is gone, otherwise no column is ever numeric.
        return body.infer_objects()

    @staticmethod
    def _text_columns(df: pd.DataFrame) -> List:
        """Return the labels of columns stored as object or string dtype."""
        return [
            col for col, dtype in df.dtypes.items()
            if pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        ]

    def _generate_enhanced_sheet_summary(self, df: pd.DataFrame, ws) -> Dict:
        """Generate comprehensive statistical summary for a sheet

        Args:
            df: Sheet contents.
            ws: Worksheet object the merged-cell and formula details are
                read from.

        Returns:
            Dict with row/column counts, per-column null counts, numeric and
            categorical summaries, merged cells and formulas.
        """
        summary = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'column_types': {},
            'numeric_summaries': {},
            'categorical_summaries': {},
            # Plain ints keep the summary serialisable.
            'null_counts': {col: int(count)
                            for col, count in df.isnull().sum().items()},
            'merged_cells': self._get_merged_cells_info(ws),
            'formulas': self._get_formulas_info(ws)
        }

        stat_names = ('mean', 'median', 'std', 'min', 'max', 'sum')
        for col in df.select_dtypes(include=[np.number]).columns:
            col_data = pd.to_numeric(df[col], errors='coerce')
            if col_data.empty:
                stats = dict.fromkeys(stat_names)
            else:
                stats = {name: float(getattr(col_data, name)())
                         for name in stat_names}
            summary['numeric_summaries'][col] = stats
            summary['column_types'][col] = 'numeric'

        for col in self._text_columns(df):
            # Drop missing cells before stringifying so they are not counted
            # as the literal values "nan" / "None".
            values = df[col].dropna().astype(str)
            if values.empty:
                continue
            value_counts = values.value_counts()
            summary['categorical_summaries'][col] = {
                'unique_values': int(len(value_counts)),
                'top_values': {key: int(count)
                               for key, count in value_counts.head(5).items()},
                'contains_currency': self._detect_currency(values),
                'contains_dates': self._detect_dates(values)
            }
            summary['column_types'][col] = 'categorical'

        return summary

    def _extract_enhanced_metadata(self, df: pd.DataFrame, ws) -> Dict:
        """Extract comprehensive metadata including Excel-specific features

        Args:
            df: Sheet contents.
            ws: Worksheet object the dimension, chart, image and freeze-pane
                details are read from.

        Returns:
            Dict describing columns, row count, dtype groups, column widths,
            hidden rows/columns and whether charts, images or frozen panes
            are present.
        """
        letters = [get_column_letter(position)
                   for position in range(1, len(df.columns) + 1)]
        column_widths = {letter: ws.column_dimensions[letter].width
                         for letter in letters
                         if letter in ws.column_dimensions}

        # Walk the dimension entries that already exist instead of indexing
        # every row/column: indexing creates an entry for each index touched.
        hidden_rows = sorted(int(index)
                             for index, dimension in ws.row_dimensions.items()
                             if dimension.hidden)
        hidden_columns = sorted(
            (letter for letter, dimension in ws.column_dimensions.items()
             if dimension.hidden),
            key=lambda letter: (len(letter), letter)  # A..Z before AA..
        )

        return {
            'columns': list(df.columns),
            'rows': len(df),
            'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
            'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
            'categorical_columns': self._text_columns(df),
            'column_widths': column_widths,
            'hidden_rows': hidden_rows,
            'hidden_columns': hidden_columns,
            'has_charts': bool(getattr(ws, '_charts', None)),
            'has_images': bool(getattr(ws, '_images', None)),
            'frozen_panes': ws.freeze_panes is not None
        }

    def _get_merged_cells_info(self, ws) -> List[Dict]:
        """Extract information about merged cells

        Returns one dict per merged range with its textual ``range`` and the
        coordinates of its first and last cell.
        """
        merged_cells = []
        for merged_range in ws.merged_cells.ranges:
            # The endpoints are taken from the "A1:B2" coordinate string; a
            # single-cell range has no ":" and is its own end.
            start, _, end = merged_range.coord.partition(':')
            merged_cells.append({
                'range': str(merged_range),
                'start_cell': start,
                'end_cell': end or start
            })
        return merged_cells

    def _get_formulas_info(self, ws) -> Dict[str, str]:
        """Extract formulas from the worksheet

        Returns a mapping of cell coordinate to formula text. Formulas are
        only visible when the workbook was loaded with ``data_only=False``.
        """
        formulas = {}
        for row in ws.iter_rows():
            for cell in row:
                if getattr(cell, 'data_type', None) != 'f':
                    continue
                value = cell.value
                # Plain formulas are strings; if the value is a wrapper
                # object carrying the text in ``.text``, use that instead.
                formulas[cell.coordinate] = str(getattr(value, 'text', value))
        return formulas

    def _detect_currency(self, series: pd.Series) -> bool:
        """Detect if a series contains currency values

        True when at least one value contains ``$``, ``€``, ``£`` or ``¥``.
        """
        matches = series.astype(str).str.contains(self._CURRENCY_PATTERN, regex=True)
        return bool(matches.any())

    def _detect_dates(self, series: pd.Series) -> bool:
        """Detect if a series contains date values

        True only when the series is non-empty and every value can be parsed
        by ``pd.to_datetime``.
        """
        if series.empty:
            return False
        try:
            pd.to_datetime(series, errors='raise')
        except (ValueError, TypeError, OverflowError):
            return False
        return True

    @staticmethod
    def _looks_like_id(name: str) -> bool:
        """Return True when a column name reads like an identifier column."""
        lowered = name.strip().lower()
        return (lowered == 'id'
                or lowered.endswith(('_id', ' id'))
                or name.strip().endswith('ID'))

    def _named_columns(self, df: pd.DataFrame) -> List[str]:
        """Return unique, string-named columns that carry a real header.

        Positional labels, ``Column_<n>`` placeholders and duplicated names
        are left out because they say nothing about what the column holds.
        """
        unique = df.columns[~df.columns.duplicated(keep=False)]
        prefix = self._PLACEHOLDER_PREFIX
        return [
            col for col in unique
            if isinstance(col, str)
            and not (col.startswith(prefix) and col[len(prefix):].isdigit())
        ]

    def _detect_relationships(self, sheets_data: Dict[str, pd.DataFrame]) -> Dict:
        """Detect likely relationships between every pair of sheets.

        The result uses the shape ``_generate_enhanced_structured_text``
        renders:

        * ``"<sheetA>__<sheetB>"`` -> ``type='potential_join'`` with the
          ``common_columns`` both sheets share by name;
        * ``"<sheetA>__<sheetB>__<colA>__<colB>"`` -> ``type='foreign_key'``
          when the names differ, at least one looks like an id column, and
          the values of one column are all contained in the other.

        Each entry also carries explicit ``sheets`` (and ``columns``) lists
        so consumers do not have to split the key. Both rules are
        heuristics; they flag candidates, not verified constraints.
        """
        relationships = {}
        value_cache = {}

        def values_of(sheet, col):
            # Build each column's value set once; it is compared many times.
            key = (sheet, col)
            if key not in value_cache:
                value_cache[key] = set(sheets_data[sheet][col].dropna().tolist())
            return value_cache[key]

        names = list(sheets_data)
        columns = {name: self._named_columns(sheets_data[name]) for name in names}

        for index, left in enumerate(names):
            for right in names[index + 1:]:
                right_set = set(columns[right])
                shared = [col for col in columns[left] if col in right_set]
                if shared:
                    relationships[f"{left}__{right}"] = {
                        'type': 'potential_join',
                        'sheets': [left, right],
                        'common_columns': shared
                    }

                for left_col in columns[left]:
                    for right_col in columns[right]:
                        if left_col == right_col:
                            continue  # already reported as a shared column
                        if not (self._looks_like_id(left_col)
                                or self._looks_like_id(right_col)):
                            continue
                        left_values = values_of(left, left_col)
                        right_values = values_of(right, right_col)
                        if not left_values or not right_values:
                            continue
                        if left_values <= right_values or right_values <= left_values:
                            key = f"{left}__{right}__{left_col}__{right_col}"
                            relationships[key] = {
                                'type': 'foreign_key',
                                'sheets': [left, right],
                                'columns': [left_col, right_col]
                            }
        return relationships

    @staticmethod
    def _format_stat(value) -> str:
        """Format a statistic with two decimals, or ``N/A`` when missing."""
        return "N/A" if value is None else f"{value:.2f}"

    def _relationship_lines(self) -> List[str]:
        """Render ``self.relationships`` as text lines (empty when none)."""
        if not self.relationships:
            return []
        lines = ["Sheet Relationships:"]
        for rel_key, rel_info in self.relationships.items():
            rel_type = rel_info.get('type')
            # Prefer the explicit lists; fall back to splitting the key for
            # entries that only encode the names there.
            parts = rel_key.split('__')
            sheets = rel_info.get('sheets') or parts[:2]
            if len(sheets) < 2:
                continue
            if rel_type == 'potential_join':
                common = ', '.join(str(col) for col in rel_info.get('common_columns', []))
                lines.append(f"- {sheets[0]} and {sheets[1]} share columns: " +
                             f"{common}")
            elif rel_type == 'foreign_key':
                cols = rel_info.get('columns') or parts[2:4]
                if len(cols) < 2:
                    continue
                lines.append(f"- Potential foreign key relationship between " +
                             f"{sheets[0]}.{cols[0]} and {sheets[1]}.{cols[1]}")
        return lines

    def _generate_enhanced_structured_text(self, sheets_data: Dict[str, pd.DataFrame], workbook) -> str:
        """Generate detailed structured text representation of Excel content

        Args:
            sheets_data: Sheet name -> sheet contents; each name must already
                have an entry in ``sheet_metadata`` and ``sheet_summaries``.
            workbook: Accepted for interface compatibility; not read here.

        Returns:
            str: Overview, one section per sheet, then sheet relationships.
        """
        output_parts = [
            "Excel File Overview:",
            f"Total Sheets: {len(sheets_data)}",
            ""
        ]

        for sheet_name, df in sheets_data.items():
            metadata = self.sheet_metadata[sheet_name]
            summary = self.sheet_summaries[sheet_name]

            output_parts.append(f"Sheet: {sheet_name}")
            output_parts.append("=" * (len(sheet_name) + 7))
            output_parts.append(f"Rows: {metadata['rows']}")
            # Column labels are integers when a sheet has no header row.
            output_parts.append(
                f"Columns: {', '.join(str(col) for col in metadata['columns'])}")

            if metadata['hidden_rows']:
                output_parts.append(f"Hidden Rows: {len(metadata['hidden_rows'])}")
            if metadata['hidden_columns']:
                output_parts.append(f"Hidden Columns: {len(metadata['hidden_columns'])}")

            if summary['merged_cells']:
                output_parts.append("\nMerged Cells:")
                for merge_info in summary['merged_cells'][:5]:  # first 5 ranges
                    output_parts.append(f" - Range: {merge_info['range']}")

            if metadata['numeric_columns']:
                output_parts.append("\nNumeric Columns Summary:")
                for col in metadata['numeric_columns']:
                    stats = summary['numeric_summaries'].get(col)
                    if stats is None:
                        continue
                    output_parts.append(f" {col}:")
                    output_parts.append(f" Range: {stats['min']} to {stats['max']}")
                    output_parts.append(f" Average: {self._format_stat(stats['mean'])}")
                    output_parts.append(f" Sum: {self._format_stat(stats['sum'])}")

            if metadata['categorical_columns']:
                output_parts.append("\nCategorical Columns Summary:")
                for col in metadata['categorical_columns']:
                    cats = summary['categorical_summaries'].get(col)
                    if cats is None:
                        continue
                    output_parts.append(f" {col}:")
                    output_parts.append(f" Unique Values: {cats['unique_values']}")
                    if cats['top_values']:
                        output_parts.append(" Top Values: " +
                                            ", ".join(f"{k} ({v})" for k, v in
                                                      list(cats['top_values'].items())[:3]))
                    if cats['contains_currency']:
                        output_parts.append(" Contains Currency Values")
                    if cats['contains_dates']:
                        output_parts.append(" Contains Date Values")

            if summary['formulas']:
                output_parts.append("\nFormulas Present:")
                for cell, formula in list(summary['formulas'].items())[:5]:  # first 5
                    output_parts.append(f" {cell}: {formula}")

            output_parts.append("\nSample Data:")
            sample_data = df.head(5).fillna("").to_string(index=False)
            output_parts.append(sample_data)
            output_parts.append("\n")

        output_parts.extend(self._relationship_lines())
        return "\n".join(output_parts)

    def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
        """Get summary for a specific sheet (``None`` if the sheet is unknown)"""
        return self.sheet_summaries.get(sheet_name)

    def get_relationships(self) -> Dict:
        """Get detected relationships between sheets"""
        return self.relationships

    def get_metadata(self) -> Dict:
        """Get complete metadata for all sheets"""
        return self.sheet_metadata