# chatbot-backend/src/utils/enhanced_excel_processor.py
# TalatMasood's picture
# Changes to be committed:
# be32fd8
from typing import Dict, List, Any, Optional
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from openpyxl import load_workbook
from openpyxl.utils.cell import get_column_letter
class EnhancedExcelProcessor:
    """Turn an Excel workbook into a structured plain-text description.

    pandas supplies the cell data; openpyxl supplies workbook-level details
    (merged ranges, formulas, hidden rows/columns, column widths).  Results of
    the most recent ``process_excel`` call are kept on the instance:

    * ``sheet_summaries`` - per-sheet statistics keyed by sheet name
    * ``sheet_metadata`` - per-sheet structural metadata keyed by sheet name
    * ``relationships`` - shared-column relationships between sheets
    """

    # A character class is used so that "$" is matched literally instead of
    # acting as the regex end-of-string anchor (which matches every value).
    _CURRENCY_PATTERN = r"[$€£¥]"

    # Statistics reported for numeric columns, in output order.
    _NUMERIC_STATS = ("mean", "median", "std", "min", "max", "sum")

    # Prefix of the names generated for columns without a header cell.
    _PLACEHOLDER_PREFIX = "Column_"

    def __init__(self):
        """Initialize the enhanced Excel processor with empty result stores."""
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}

    def process_excel(self, file_path: Path) -> str:
        """
        Process Excel file with enhanced data extraction

        Args:
            file_path (Path): Path to Excel file

        Returns:
            str: Structured text representation of Excel content
        """
        # Start from a clean slate so sheets of a previously processed
        # workbook do not leak into this run's results.
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}

        sheets_data = {}

        # The workbook is only used for metadata (never for cell values), so
        # it is loaded with data_only=False: with data_only=True formulas are
        # replaced by their cached results and cannot be reported.
        workbook = load_workbook(file_path, data_only=False)
        try:
            with pd.ExcelFile(file_path) as excel_file:
                for sheet_name in excel_file.sheet_names:
                    # Read without assuming a header to capture all data
                    raw = pd.read_excel(
                        excel_file,
                        sheet_name=sheet_name,
                        header=None,
                    )
                    df = self._promote_header_row(raw)
                    worksheet = workbook[sheet_name]

                    sheets_data[sheet_name] = df
                    self.sheet_summaries[sheet_name] = (
                        self._generate_enhanced_sheet_summary(df, worksheet)
                    )
                    self.sheet_metadata[sheet_name] = (
                        self._extract_enhanced_metadata(df, worksheet)
                    )

            # Detect relationships between sheets
            self.relationships = self._detect_relationships(sheets_data)

            # Generate structured text representation
            return self._generate_enhanced_structured_text(sheets_data, workbook)
        finally:
            workbook.close()

    def _promote_header_row(self, df: pd.DataFrame) -> pd.DataFrame:
        """Use the first row as column names when it holds any data.

        Sheets that are empty, or whose first row is blank, get generated
        ``Column_<i>`` names and keep all of their rows.  Column dtypes are
        re-inferred afterwards: reading with ``header=None`` mixes header
        text with the data, which leaves numeric columns typed as ``object``.
        """
        if not df.empty and df.iloc[0].notna().any():
            names = self._build_column_names(df.iloc[0])
            df = df.iloc[1:].reset_index(drop=True)  # Remove header row from data
            df.columns = names
        else:
            df = df.copy()
            df.columns = [
                f"{self._PLACEHOLDER_PREFIX}{i}" for i in range(df.shape[1])
            ]
        return df.infer_objects()

    @staticmethod
    def _build_column_names(header_row) -> List[str]:
        """Build unique string column names from the cells of a header row.

        Blank cells become ``Column_<position>``; repeated names get a
        numeric suffix (``name_2``, ``name_3``, ...) so that selecting a
        column by name always yields a single column.
        """
        names: List[str] = []
        used = set()
        for position, raw in enumerate(header_row):
            label = "" if pd.isna(raw) else str(raw).strip()
            base = label or f"Column_{position}"
            candidate = base
            suffix = 1
            while candidate in used:
                suffix += 1
                candidate = f"{base}_{suffix}"
            used.add(candidate)
            names.append(candidate)
        return names

    def _generate_enhanced_sheet_summary(self, df: pd.DataFrame, ws) -> Dict:
        """Generate comprehensive statistical summary for a sheet.

        Args:
            df: Sheet data as produced by ``process_excel``.
            ws: Worksheet object providing ``merged_cells`` and ``iter_rows``.

        Returns:
            Dict with row/column counts, per-column types, numeric and
            categorical summaries, null counts, merged cells and formulas.
        """
        summary = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'column_types': {},
            'numeric_summaries': {},
            'categorical_summaries': {},
            'null_counts': df.isnull().sum().to_dict(),
            'merged_cells': self._get_merged_cells_info(ws),
            'formulas': self._get_formulas_info(ws),
        }

        # Numeric columns
        for col in df.select_dtypes(include=[np.number]).columns:
            col_data = pd.to_numeric(df[col], errors='coerce')
            has_rows = not col_data.empty
            summary['numeric_summaries'][col] = {
                stat: float(getattr(col_data, stat)()) if has_rows else None
                for stat in self._NUMERIC_STATS
            }
            summary['column_types'][col] = 'numeric'

        # Date columns are only labelled; no statistics are computed for them.
        for col in df.select_dtypes(include=['datetime64']).columns:
            summary['column_types'][col] = 'datetime'

        # Categorical and text columns
        for col in df.select_dtypes(include=['object']).columns:
            # Drop missing cells *before* converting to text, so that real
            # missing values are excluded and literal text is never dropped.
            values = df[col].dropna().astype(str)
            if not values.empty:
                value_counts = values.value_counts()
                summary['categorical_summaries'][col] = {
                    'unique_values': int(len(value_counts)),
                    'top_values': value_counts.head(5).to_dict(),
                    'contains_currency': self._detect_currency(values),
                    'contains_dates': self._detect_dates(values),
                }
            summary['column_types'][col] = 'categorical'

        return summary

    def _extract_enhanced_metadata(self, df: pd.DataFrame, ws) -> Dict:
        """Extract comprehensive metadata including Excel-specific features.

        Args:
            df: Sheet data as produced by ``process_excel``.
            ws: openpyxl worksheet for the same sheet.

        Returns:
            Dict describing columns, row count, column groups by dtype,
            column widths, hidden rows/columns, charts, images and panes.
        """
        column_dims = ws.column_dimensions
        max_column = ws.max_column
        max_row = ws.max_row

        column_widths = {}
        for position in range(1, len(df.columns) + 1):
            letter = get_column_letter(position)
            # Membership test first: indexing a missing key would create it.
            if letter in column_dims:
                column_widths[letter] = column_dims[letter].width

        # Iterate the dimensions that exist instead of indexing every
        # row/column, which would create a dimension entry per index.
        hidden_rows = sorted(
            idx for idx, dim in ws.row_dimensions.items()
            if dim.hidden and 1 <= idx <= max_row
        )

        def letter_order(letter: str):
            # Column letters order by length first, then alphabetically.
            return (len(letter), letter)

        hidden_letters = set()
        for letter, dim in column_dims.items():
            if not dim.hidden:
                continue
            first = getattr(dim, 'min', None)
            last = getattr(dim, 'max', None)
            if first and last:
                # One dimension may describe a run of adjacent columns
                # (min..max); expand it, capped at the sheet's last column.
                hidden_letters.update(
                    get_column_letter(i)
                    for i in range(first, min(last, max_column) + 1)
                )
            else:
                hidden_letters.add(letter)
        last_letter = letter_order(get_column_letter(max_column))
        hidden_columns = sorted(
            (name for name in hidden_letters if letter_order(name) <= last_letter),
            key=letter_order,
        )

        return {
            'columns': list(df.columns),
            'rows': len(df),
            'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
            'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
            'categorical_columns': df.select_dtypes(include=['object']).columns.tolist(),
            'column_widths': column_widths,
            'hidden_rows': hidden_rows,
            'hidden_columns': hidden_columns,
            'has_charts': bool(getattr(ws, '_charts', None)),
            'has_images': bool(getattr(ws, '_images', None)),
            'frozen_panes': ws.freeze_panes is not None,
        }

    def _get_merged_cells_info(self, ws) -> List[Dict]:
        """Extract information about merged cells.

        Returns:
            One dict per merged range with its ``range`` string and the
            coordinates of its top-left (``start_cell``) and bottom-right
            (``end_cell``) cells.
        """
        merged_cells = []
        for merged_range in ws.merged_cells.ranges:
            # Corner coordinates are derived from the numeric bounds of the
            # range rather than from cell objects attached to it.
            start = f"{get_column_letter(merged_range.min_col)}{merged_range.min_row}"
            end = f"{get_column_letter(merged_range.max_col)}{merged_range.max_row}"
            merged_cells.append({
                'range': str(merged_range),
                'start_cell': start,
                'end_cell': end,
            })
        return merged_cells

    def _get_formulas_info(self, ws) -> Dict[str, str]:
        """Extract formulas from the worksheet.

        Returns:
            Mapping of cell coordinate to formula text for every cell whose
            ``data_type`` is ``"f"``.
        """
        formulas = {}
        for row in ws.iter_rows():
            for cell in row:
                if getattr(cell, 'data_type', None) != 'f':
                    continue
                value = cell.value
                # Formula objects that carry their text in a ``text``
                # attribute are unwrapped; plain strings are used as-is.
                formulas[cell.coordinate] = str(getattr(value, 'text', value))
        return formulas

    def _detect_currency(self, series: pd.Series) -> bool:
        """Detect if a series contains currency values ($, €, £ or ¥)."""
        if series.empty:
            return False
        matches = series.astype(str).str.contains(self._CURRENCY_PATTERN, regex=True)
        return bool(matches.any())

    def _detect_dates(self, series: pd.Series) -> bool:
        """Detect if a series contains date values.

        Returns True only when every value parses as a date.  Empty input
        and purely numeric values are not treated as dates.
        """
        import warnings  # only needed here; silences per-call parsing warnings

        if series.empty:
            return False
        # Bare numbers are parseable by the date parser but are not dates.
        if pd.to_numeric(series, errors='coerce').notna().all():
            return False
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                pd.to_datetime(series, errors='raise')
        except (ValueError, TypeError, OverflowError):
            return False
        return True

    def _detect_relationships(self, sheets_data: Dict[str, pd.DataFrame]) -> Dict:
        """Detect sheets that share column names.

        Args:
            sheets_data: Mapping of sheet name to its DataFrame.

        Returns:
            Dict keyed by ``"<sheet_a>__<sheet_b>"``; each value has
            ``type`` (``'potential_join'``), ``sheets`` (the two sheet names)
            and ``common_columns`` (sorted shared column names).  Generated
            ``Column_<i>`` placeholder names are ignored.
        """
        prefix = self._PLACEHOLDER_PREFIX

        def real_columns(df: pd.DataFrame) -> set:
            names = set()
            for column in df.columns:
                name = str(column)
                is_placeholder = (
                    name.startswith(prefix) and name[len(prefix):].isdigit()
                )
                if not is_placeholder:
                    names.add(name)
            return names

        columns_by_sheet = {
            sheet: real_columns(df) for sheet, df in sheets_data.items()
        }
        sheet_names = list(columns_by_sheet)

        relationships = {}
        for position, left in enumerate(sheet_names):
            for right in sheet_names[position + 1:]:
                shared = sorted(columns_by_sheet[left] & columns_by_sheet[right])
                if shared:
                    relationships[f"{left}__{right}"] = {
                        'type': 'potential_join',
                        'sheets': [left, right],
                        'common_columns': shared,
                    }
        return relationships

    @staticmethod
    def _format_stat(value) -> str:
        """Format a statistic with two decimals, or ``n/a`` when it is None."""
        return "n/a" if value is None else f"{value:.2f}"

    def _generate_enhanced_structured_text(self, sheets_data: Dict[str, pd.DataFrame], workbook) -> str:
        """Generate detailed structured text representation of Excel content.

        Args:
            sheets_data: Mapping of sheet name to its DataFrame; every sheet
                must already have entries in ``sheet_metadata`` and
                ``sheet_summaries``.
            workbook: Accepted for interface compatibility; not used here.

        Returns:
            str: Newline-joined report covering each sheet and the detected
            relationships between sheets.
        """
        output_parts = []

        # Overall summary
        output_parts.append("Excel File Overview:")
        output_parts.append(f"Total Sheets: {len(sheets_data)}")
        output_parts.append("")

        # Sheet details
        for sheet_name, df in sheets_data.items():
            output_parts.append(f"Sheet: {sheet_name}")
            output_parts.append("=" * (len(sheet_name) + 7))

            metadata = self.sheet_metadata[sheet_name]
            summary = self.sheet_summaries[sheet_name]

            # Basic info; labels are stringified because they may not be text
            output_parts.append(f"Rows: {metadata['rows']}")
            columns_text = ', '.join(str(col) for col in metadata['columns'])
            output_parts.append(f"Columns: {columns_text}")

            # Hidden elements
            if metadata['hidden_rows']:
                output_parts.append(f"Hidden Rows: {len(metadata['hidden_rows'])}")
            if metadata['hidden_columns']:
                output_parts.append(f"Hidden Columns: {len(metadata['hidden_columns'])}")

            # Merged cells (first 5 ranges only)
            if summary['merged_cells']:
                output_parts.append("\nMerged Cells:")
                for merge_info in summary['merged_cells'][:5]:
                    output_parts.append(f" - Range: {merge_info['range']}")

            # Numeric columns summary
            if metadata['numeric_columns']:
                output_parts.append("\nNumeric Columns Summary:")
                for col in metadata['numeric_columns']:
                    stats = summary['numeric_summaries'].get(col)
                    if stats is None:
                        continue
                    output_parts.append(f" {col}:")
                    output_parts.append(f" Range: {stats['min']} to {stats['max']}")
                    output_parts.append(f" Average: {self._format_stat(stats['mean'])}")
                    output_parts.append(f" Sum: {self._format_stat(stats['sum'])}")

            # Categorical columns summary
            if metadata['categorical_columns']:
                output_parts.append("\nCategorical Columns Summary:")
                for col in metadata['categorical_columns']:
                    cats = summary['categorical_summaries'].get(col)
                    if cats is None:
                        continue
                    output_parts.append(f" {col}:")
                    output_parts.append(f" Unique Values: {cats['unique_values']}")
                    if cats['top_values']:
                        top_three = list(cats['top_values'].items())[:3]
                        output_parts.append(
                            " Top Values: "
                            + ", ".join(f"{k} ({v})" for k, v in top_three)
                        )
                    if cats['contains_currency']:
                        output_parts.append(" Contains Currency Values")
                    if cats['contains_dates']:
                        output_parts.append(" Contains Date Values")

            # Formulas (first 5 only)
            if summary['formulas']:
                output_parts.append("\nFormulas Present:")
                for cell, formula in list(summary['formulas'].items())[:5]:
                    output_parts.append(f" {cell}: {formula}")

            # Sample data
            output_parts.append("\nSample Data:")
            sample_data = df.head(5).fillna("").to_string(index=False)
            output_parts.append(sample_data)
            output_parts.append("\n")

        # Sheet relationships
        if self.relationships:
            output_parts.append("Sheet Relationships:")
            for rel_key, rel_info in self.relationships.items():
                rel_type = rel_info.get('type')
                parts = rel_key.split('__')
                if rel_type == 'potential_join':
                    # Prefer explicitly stored sheet names; splitting the key
                    # is ambiguous when a sheet name itself contains "__".
                    sheets = rel_info.get('sheets') or parts
                    if len(sheets) < 2:
                        continue
                    shared = ', '.join(
                        str(col) for col in rel_info.get('common_columns', [])
                    )
                    output_parts.append(
                        f"- {sheets[0]} and {sheets[1]} share columns: " + shared
                    )
                elif rel_type == 'foreign_key' and len(parts) >= 4:
                    output_parts.append(
                        "- Potential foreign key relationship between "
                        + f"{parts[0]}.{parts[2]} and {parts[1]}.{parts[3]}"
                    )

        return "\n".join(output_parts)

    def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
        """Get summary for a specific sheet, or None if the sheet is unknown."""
        return self.sheet_summaries.get(sheet_name)

    def get_relationships(self) -> Dict:
        """Get detected relationships between sheets."""
        return self.relationships

    def get_metadata(self) -> Dict:
        """Get complete metadata for all sheets."""
        return self.sheet_metadata