Spaces:
Running
Running
File size: 4,723 Bytes
b953016 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
from typing import Dict, Any
from pathlib import Path
class ExcelIntegration:
def __init__(self, enhanced_processor):
"""
Initialize Excel integration
Args:
enhanced_processor: Instance of EnhancedExcelProcessor
"""
self.processor = enhanced_processor
def process_for_rag(self, file_path: Path) -> Dict[str, Any]:
"""
Process Excel file for RAG system
Args:
file_path (Path): Path to Excel file
Returns:
Dict[str, Any]: Processed content and metadata
"""
# Process Excel file
content = self.processor.process_excel(file_path)
# Get all metadata
metadata = {
'sheet_summaries': self.processor.sheet_summaries,
'relationships': self.processor.relationships,
'sheet_metadata': self.processor.sheet_metadata
}
# Create chunks based on logical divisions
chunks = self._create_semantic_chunks(content)
return {
'content': content,
'chunks': chunks,
'metadata': metadata
}
def _create_semantic_chunks(self, content: str) -> list:
"""
Create meaningful chunks from Excel content
Args:
content (str): Processed Excel content
Returns:
list: List of content chunks
"""
chunks = []
current_chunk = []
current_sheet = None
for line in content.split('\n'):
# Start new chunk for each sheet
if line.startswith('Sheet: '):
if current_chunk:
chunks.append('\n'.join(current_chunk))
current_chunk = []
current_sheet = line
current_chunk.append(line)
# Start new chunk for major sections within sheet
elif any(line.startswith(section) for section in
['Numeric Columns Summary:', 'Categorical Columns Summary:',
'Sample Data:', 'Sheet Relationships:']):
if current_chunk:
chunks.append('\n'.join(current_chunk))
current_chunk = []
if current_sheet:
current_chunk.append(current_sheet)
current_chunk.append(line)
else:
current_chunk.append(line)
# Add final chunk
if current_chunk:
chunks.append('\n'.join(current_chunk))
return chunks
def get_sheet_context(self, sheet_name: str) -> str:
"""
Get specific context for a sheet
Args:
sheet_name (str): Name of the sheet
Returns:
str: Contextual information about the sheet
"""
if sheet_name not in self.processor.sheet_metadata:
return ""
metadata = self.processor.sheet_metadata[sheet_name]
summary = self.processor.sheet_summaries[sheet_name]
context_parts = [
f"Sheet: {sheet_name}",
f"Total Rows: {metadata['rows']}",
f"Columns: {', '.join(metadata['columns'])}",
]
# Add numeric column summaries
if metadata['numeric_columns']:
context_parts.append("\nNumeric Columns:")
for col in metadata['numeric_columns']:
stats = summary['numeric_summaries'][col]
context_parts.append(f"- {col}: Range {stats['min']} to {stats['max']}, "
f"Average {stats['mean']:.2f}")
# Add categorical column summaries
if metadata['categorical_columns']:
context_parts.append("\nCategorical Columns:")
for col in metadata['categorical_columns']:
cats = summary['categorical_summaries'][col]
context_parts.append(f"- {col}: {cats['unique_values']} unique values")
return "\n".join(context_parts)
def get_relationship_context(self) -> str:
"""
Get context about relationships between sheets
Returns:
str: Information about sheet relationships
"""
if not self.processor.relationships:
return "No relationships detected between sheets."
context_parts = ["Sheet Relationships:"]
for rel_key, rel_info in self.processor.relationships.items():
if rel_info['type'] == 'potential_join':
sheets = rel_ |