Spaces:
Running
Running
Enhanced Excel file support and added an endpoint providing an optimized vector store and RAG for Excel files.
b953016
from typing import Dict, Any | |
from pathlib import Path | |
class ExcelIntegration:
    """Bridge between an EnhancedExcelProcessor and the RAG pipeline."""

    def __init__(self, enhanced_processor):
        """Keep a reference to the Excel processor used by all methods.

        Args:
            enhanced_processor: Instance of EnhancedExcelProcessor.
        """
        self.processor = enhanced_processor
def process_for_rag(self, file_path: Path) -> Dict[str, Any]: | |
""" | |
Process Excel file for RAG system | |
Args: | |
file_path (Path): Path to Excel file | |
Returns: | |
Dict[str, Any]: Processed content and metadata | |
""" | |
# Process Excel file | |
content = self.processor.process_excel(file_path) | |
# Get all metadata | |
metadata = { | |
'sheet_summaries': self.processor.sheet_summaries, | |
'relationships': self.processor.relationships, | |
'sheet_metadata': self.processor.sheet_metadata | |
} | |
# Create chunks based on logical divisions | |
chunks = self._create_semantic_chunks(content) | |
return { | |
'content': content, | |
'chunks': chunks, | |
'metadata': metadata | |
} | |
def _create_semantic_chunks(self, content: str) -> list: | |
""" | |
Create meaningful chunks from Excel content | |
Args: | |
content (str): Processed Excel content | |
Returns: | |
list: List of content chunks | |
""" | |
chunks = [] | |
current_chunk = [] | |
current_sheet = None | |
for line in content.split('\n'): | |
# Start new chunk for each sheet | |
if line.startswith('Sheet: '): | |
if current_chunk: | |
chunks.append('\n'.join(current_chunk)) | |
current_chunk = [] | |
current_sheet = line | |
current_chunk.append(line) | |
# Start new chunk for major sections within sheet | |
elif any(line.startswith(section) for section in | |
['Numeric Columns Summary:', 'Categorical Columns Summary:', | |
'Sample Data:', 'Sheet Relationships:']): | |
if current_chunk: | |
chunks.append('\n'.join(current_chunk)) | |
current_chunk = [] | |
if current_sheet: | |
current_chunk.append(current_sheet) | |
current_chunk.append(line) | |
else: | |
current_chunk.append(line) | |
# Add final chunk | |
if current_chunk: | |
chunks.append('\n'.join(current_chunk)) | |
return chunks | |
def get_sheet_context(self, sheet_name: str) -> str: | |
""" | |
Get specific context for a sheet | |
Args: | |
sheet_name (str): Name of the sheet | |
Returns: | |
str: Contextual information about the sheet | |
""" | |
if sheet_name not in self.processor.sheet_metadata: | |
return "" | |
metadata = self.processor.sheet_metadata[sheet_name] | |
summary = self.processor.sheet_summaries[sheet_name] | |
context_parts = [ | |
f"Sheet: {sheet_name}", | |
f"Total Rows: {metadata['rows']}", | |
f"Columns: {', '.join(metadata['columns'])}", | |
] | |
# Add numeric column summaries | |
if metadata['numeric_columns']: | |
context_parts.append("\nNumeric Columns:") | |
for col in metadata['numeric_columns']: | |
stats = summary['numeric_summaries'][col] | |
context_parts.append(f"- {col}: Range {stats['min']} to {stats['max']}, " | |
f"Average {stats['mean']:.2f}") | |
# Add categorical column summaries | |
if metadata['categorical_columns']: | |
context_parts.append("\nCategorical Columns:") | |
for col in metadata['categorical_columns']: | |
cats = summary['categorical_summaries'][col] | |
context_parts.append(f"- {col}: {cats['unique_values']} unique values") | |
return "\n".join(context_parts) | |
def get_relationship_context(self) -> str: | |
""" | |
Get context about relationships between sheets | |
Returns: | |
str: Information about sheet relationships | |
""" | |
if not self.processor.relationships: | |
return "No relationships detected between sheets." | |
context_parts = ["Sheet Relationships:"] | |
for rel_key, rel_info in self.processor.relationships.items(): | |
if rel_info['type'] == 'potential_join': | |
sheets = rel_ |