from pathlib import Path
from typing import Any, Dict, List


class ExcelIntegration:
    """Expose the output of an Excel processor as content, chunks and context.

    The wrapped processor must provide ``process_excel(file_path)`` and the
    attributes ``sheet_summaries``, ``relationships`` and ``sheet_metadata``;
    those are the only members this class touches.
    """

    # A line starting with this prefix opens a new sheet (and a new chunk).
    _SHEET_PREFIX = 'Sheet: '

    # A line starting with any of these opens a new chunk inside a sheet.
    # Kept as a tuple so it can be handed straight to ``str.startswith``.
    _SECTION_PREFIXES = (
        'Numeric Columns Summary:',
        'Categorical Columns Summary:',
        'Sample Data:',
        'Sheet Relationships:',
    )

    def __init__(self, enhanced_processor):
        """
        Initialize Excel integration

        Args:
            enhanced_processor: Instance of EnhancedExcelProcessor
        """
        self.processor = enhanced_processor

    def process_for_rag(self, file_path: Path) -> Dict[str, Any]:
        """
        Process Excel file for RAG system

        Args:
            file_path (Path): Path to Excel file

        Returns:
            Dict[str, Any]: Dictionary with three keys: ``content`` (whatever
                ``process_excel`` returned), ``chunks`` (that content split by
                ``_create_semantic_chunks``) and ``metadata`` (the processor's
                ``sheet_summaries``, ``relationships`` and ``sheet_metadata``).
        """
        content = self.processor.process_excel(file_path)

        # Read the processor attributes only after process_excel has run.
        # NOTE(review): presumably process_excel is what populates them --
        # confirm against EnhancedExcelProcessor.
        metadata = {
            'sheet_summaries': self.processor.sheet_summaries,
            'relationships': self.processor.relationships,
            'sheet_metadata': self.processor.sheet_metadata,
        }

        chunks = self._create_semantic_chunks(content)

        return {
            'content': content,
            'chunks': chunks,
            'metadata': metadata,
        }

    def _create_semantic_chunks(self, content: str) -> List[str]:
        """
        Create meaningful chunks from Excel content

        A new chunk starts at every ``Sheet: `` line and at every line that
        begins with one of ``_SECTION_PREFIXES``. Section chunks are prefixed
        with the most recent ``Sheet: `` line (when one has been seen) so each
        chunk names the sheet it belongs to.

        Args:
            content (str): Processed Excel content

        Returns:
            List[str]: Content chunks in document order. Chunks that contain
                only whitespace are dropped, so empty content yields ``[]``.
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_sheet = None

        for line in content.split('\n'):
            starts_sheet = line.startswith(self._SHEET_PREFIX)
            starts_section = (
                not starts_sheet and line.startswith(self._SECTION_PREFIXES)
            )

            if starts_sheet or starts_section:
                # Close whatever was being collected before the boundary.
                if current_chunk:
                    chunks.append('\n'.join(current_chunk))
                    current_chunk = []
                if starts_sheet:
                    current_sheet = line
                elif current_sheet:
                    # Repeat the sheet header at the top of the section chunk.
                    current_chunk.append(current_sheet)

            current_chunk.append(line)

        # Add final chunk
        if current_chunk:
            chunks.append('\n'.join(current_chunk))

        # Blank chunks carry no information for retrieval; without this
        # filter, empty content would produce a single '' chunk.
        return [chunk for chunk in chunks if chunk.strip()]

    def get_sheet_context(self, sheet_name: str) -> str:
        """
        Get specific context for a sheet

        Args:
            sheet_name (str): Name of the sheet

        Returns:
            str: Contextual information about the sheet, or an empty string
                when the sheet is not present in the processor's
                ``sheet_metadata``.

        Raises:
            KeyError: If the sheet has metadata but no matching entry in
                ``sheet_summaries``, or a listed column has no summary.
        """
        if sheet_name not in self.processor.sheet_metadata:
            return ""

        metadata = self.processor.sheet_metadata[sheet_name]
        summary = self.processor.sheet_summaries[sheet_name]

        # Column labels are not guaranteed to be strings (spreadsheet headers
        # may be numbers or dates), and str.join rejects non-strings.
        column_labels = ', '.join(str(col) for col in metadata['columns'])

        context_parts = [
            f"Sheet: {sheet_name}",
            f"Total Rows: {metadata['rows']}",
            f"Columns: {column_labels}",
        ]

        # Add numeric column summaries
        if metadata['numeric_columns']:
            context_parts.append("\nNumeric Columns:")
            for col in metadata['numeric_columns']:
                stats = summary['numeric_summaries'][col]
                context_parts.append(
                    f"- {col}: Range {stats['min']} to {stats['max']}, "
                    f"Average {stats['mean']:.2f}"
                )

        # Add categorical column summaries
        if metadata['categorical_columns']:
            context_parts.append("\nCategorical Columns:")
            for col in metadata['categorical_columns']:
                cats = summary['categorical_summaries'][col]
                context_parts.append(
                    f"- {col}: {cats['unique_values']} unique values"
                )

        return "\n".join(context_parts)

    def get_relationship_context(self) -> str:
        """
        Get context about relationships between sheets

        Returns:
            str: Information about sheet relationships
        """
        if not self.processor.relationships:
            return "No relationships detected between sheets."

        context_parts = ["Sheet Relationships:"]
        for rel_key, rel_info in self.processor.relationships.items():
            if rel_info['type'] == 'potential_join':
                # NOTE(review): the visible source is cut off in the middle of
                # the next statement; it is reproduced exactly as far as it is
                # visible and the rest of this method must be restored from
                # the original file.
                sheets = rel_