Spaces:
Running
Running
Enhanced support for Excel files and added an endpoint that provides an optimized vector store and RAG for Excel.
b953016
from typing import Dict, List, Any, Optional | |
import pandas as pd | |
import numpy as np | |
from pathlib import Path | |
import json | |
class EnhancedExcelProcessor:
    """Turn a multi-sheet Excel workbook into a structured text description.

    After :meth:`process_excel` runs, three attributes describe the workbook
    that was processed most recently:

    * ``sheet_summaries`` -- per-sheet statistics (see
      :meth:`_generate_sheet_summary`).
    * ``sheet_metadata`` -- per-sheet column lists grouped by dtype.
    * ``relationships`` -- shared columns / candidate foreign keys between
      sheets (see :meth:`_detect_relationships`).
    """

    def __init__(self):
        """Initialize the enhanced Excel processor with empty state."""
        self.sheet_summaries = {}
        self.relationships = {}
        self.sheet_metadata = {}

    def process_excel(self, file_path: Path) -> str:
        """
        Process Excel file with enhanced multi-sheet handling

        Args:
            file_path (Path): Path to Excel file

        Returns:
            str: Structured text representation of Excel content
        """
        # Read every sheet inside a context manager so the workbook handle is
        # released even if parsing one of the sheets fails.
        sheets_data: Dict[str, pd.DataFrame] = {}
        with pd.ExcelFile(file_path) as excel_file:
            for sheet_name in excel_file.sheet_names:
                sheets_data[sheet_name] = pd.read_excel(
                    excel_file, sheet_name=sheet_name
                )

        # Rebuild all state from scratch: `relationships` has always been
        # replaced per call, so summaries/metadata must not keep sheets that
        # belong to a previously processed workbook.
        self.sheet_summaries = {
            name: self._generate_sheet_summary(df)
            for name, df in sheets_data.items()
        }
        self.sheet_metadata = {
            name: self._build_sheet_metadata(df)
            for name, df in sheets_data.items()
        }

        # Detect relationships between sheets
        self.relationships = self._detect_relationships(sheets_data)

        # Generate structured text representation
        return self._generate_structured_text(sheets_data)

    @staticmethod
    def _build_sheet_metadata(df: pd.DataFrame) -> Dict:
        """Return column names, row count and columns grouped by dtype."""
        return {
            'columns': list(df.columns),
            'rows': len(df),
            'numeric_columns': df.select_dtypes(include=[np.number]).columns.tolist(),
            'date_columns': df.select_dtypes(include=['datetime64']).columns.tolist(),
            'categorical_columns': df.select_dtypes(include=['object']).columns.tolist(),
        }

    def _generate_sheet_summary(self, df: pd.DataFrame) -> Dict:
        """Generate statistical summary for a sheet.

        Returns a dict with row/column counts, per-column null counts,
        mean/median/std/min/max for numeric columns, and the number of
        distinct values plus the five most frequent values for object-dtype
        columns.
        """
        summary = {
            'total_rows': len(df),
            'total_columns': len(df.columns),
            'column_types': {},
            'numeric_summaries': {},
            'categorical_summaries': {},
            'null_counts': df.isnull().sum().to_dict(),
        }

        # Process numeric columns
        for col in df.select_dtypes(include=[np.number]).columns:
            series = df[col]
            summary['numeric_summaries'][col] = {
                'mean': float(series.mean()),
                'median': float(series.median()),
                'std': float(series.std()),
                'min': float(series.min()),
                'max': float(series.max()),
            }
            summary['column_types'][col] = 'numeric'

        # Process categorical columns
        for col in df.select_dtypes(include=['object']).columns:
            value_counts = df[col].value_counts()
            summary['categorical_summaries'][col] = {
                'unique_values': int(len(value_counts)),
                'top_values': value_counts.head(5).to_dict(),
            }
            summary['column_types'][col] = 'categorical'

        return summary

    def _detect_relationships(self, sheets_data: Dict[str, pd.DataFrame]) -> Dict:
        """Detect potential relationships between sheets.

        For every unordered pair of sheets two kinds of record are produced:

        * ``"{sheet1}__{sheet2}"`` with ``type == 'potential_join'`` when the
          sheets share at least one column label.
        * ``"{sheet1}__{sheet2}__{col1}__{col2}"`` with
          ``type == 'foreign_key'`` when at least one of the two column labels
          ends in ``_id`` (case-insensitive) and the columns share a non-null
          value.

        Each record also carries a ``'sheets'`` entry so consumers do not have
        to parse the key, which is ambiguous when a name contains ``__``.
        """
        relationships = {}
        sheet_names = list(sheets_data.keys())

        # Column labels may be ints or timestamps (common in Excel headers),
        # so stringify before testing the suffix.
        id_like = {
            name: {col: str(col).lower().endswith('_id') for col in df.columns}
            for name, df in sheets_data.items()
        }

        # Distinct non-null values per (sheet, column), computed lazily and at
        # most once instead of on every iteration of the nested loop.
        value_cache = {}

        def unique_values(sheet, col):
            cache_key = (sheet, col)
            if cache_key not in value_cache:
                value_cache[cache_key] = set(sheets_data[sheet][col].dropna())
            return value_cache[cache_key]

        for i, sheet1 in enumerate(sheet_names):
            for sheet2 in sheet_names[i + 1:]:
                common_cols = set(sheets_data[sheet1].columns) & set(sheets_data[sheet2].columns)
                if common_cols:
                    relationships[f"{sheet1}__{sheet2}"] = {
                        # Sorted so the output is stable across runs; the key
                        # is str() because labels can be of mixed types.
                        'common_columns': sorted(common_cols, key=str),
                        'type': 'potential_join',
                        'sheets': [sheet1, sheet2],
                    }

                # Check for foreign key relationships
                for col1 in sheets_data[sheet1].columns:
                    for col2 in sheets_data[sheet2].columns:
                        if not (id_like[sheet1][col1] or id_like[sheet2][col2]):
                            continue
                        if unique_values(sheet1, col1).isdisjoint(unique_values(sheet2, col2)):
                            continue
                        relationships[f"{sheet1}__{sheet2}__{col1}__{col2}"] = {
                            'type': 'foreign_key',
                            'columns': [col1, col2],
                            'sheets': [sheet1, sheet2],
                        }

        return relationships

    def _generate_structured_text(self, sheets_data: Dict[str, pd.DataFrame]) -> str:
        """Generate structured text representation of Excel content.

        Reads ``self.sheet_metadata``, ``self.sheet_summaries`` and
        ``self.relationships``; each sheet in ``sheets_data`` must already have
        an entry in the first two.
        """
        output_parts = []

        # Overall summary
        output_parts.append("Excel File Overview:")
        output_parts.append(f"Total Sheets: {len(sheets_data)}")
        output_parts.append("")

        # Sheet details
        for sheet_name, df in sheets_data.items():
            output_parts.append(f"Sheet: {sheet_name}")
            # Underline as wide as the "Sheet: <name>" heading (7 = len("Sheet: ")).
            output_parts.append("=" * (len(sheet_name) + 7))

            metadata = self.sheet_metadata[sheet_name]
            summary = self.sheet_summaries[sheet_name]

            # Basic info
            output_parts.append(f"Rows: {metadata['rows']}")
            # str() each label: join() raises TypeError on non-string headers.
            output_parts.append(
                f"Columns: {', '.join(str(c) for c in metadata['columns'])}"
            )
            output_parts.append("")

            # Column summaries
            if metadata['numeric_columns']:
                output_parts.append("Numeric Columns Summary:")
                for col in metadata['numeric_columns']:
                    stats = summary['numeric_summaries'][col]
                    output_parts.append(f" {col}:")
                    output_parts.append(f" Range: {stats['min']} to {stats['max']}")
                    output_parts.append(f" Average: {stats['mean']:.2f}")
                output_parts.append("")

            if metadata['categorical_columns']:
                output_parts.append("Categorical Columns Summary:")
                for col in metadata['categorical_columns']:
                    cats = summary['categorical_summaries'][col]
                    output_parts.append(f" {col}:")
                    output_parts.append(f" Unique Values: {cats['unique_values']}")
                    if cats['top_values']:
                        top_three = list(cats['top_values'].items())[:3]
                        output_parts.append(
                            " Top Values: "
                            + ", ".join(f"{k} ({v})" for k, v in top_three)
                        )
                output_parts.append("")

            # Sample data
            output_parts.append("Sample Data:")
            output_parts.append(df.head(3).to_string())
            output_parts.append("\n")

        # Relationships
        if self.relationships:
            output_parts.append("Sheet Relationships:")
            for rel_key, rel_info in self.relationships.items():
                # Prefer the explicit sheet names stored on the record; fall
                # back to parsing the key for records that lack them.
                sheets = rel_info.get('sheets') or rel_key.split('__')[:2]
                if rel_info['type'] == 'potential_join':
                    shared = ', '.join(str(c) for c in rel_info['common_columns'])
                    output_parts.append(
                        f"- {sheets[0]} and {sheets[1]} share columns: " + shared
                    )
                elif rel_info['type'] == 'foreign_key':
                    col1, col2 = rel_info['columns']
                    output_parts.append(
                        "- Potential foreign key relationship between "
                        + f"{sheets[0]}.{col1} and {sheets[1]}.{col2}"
                    )

        return "\n".join(output_parts)

    def get_sheet_summary(self, sheet_name: str) -> Optional[Dict]:
        """Get summary for a specific sheet, or None if the sheet is unknown."""
        return self.sheet_summaries.get(sheet_name)

    def get_relationships(self) -> Dict:
        """Get detected relationships between sheets"""
        return self.relationships

    def get_metadata(self) -> Dict:
        """Get complete metadata for all sheets"""
        return self.sheet_metadata