chatbot-backend / src /utils /excel_integration
TalatMasood's picture
Enhanced support for Excel files and added an endpoint that provides an optimized vector store and RAG for Excel.
b953016
raw
history blame
4.72 kB
from typing import Dict, Any
from pathlib import Path
class ExcelIntegration:
def __init__(self, enhanced_processor):
"""
Initialize Excel integration
Args:
enhanced_processor: Instance of EnhancedExcelProcessor
"""
self.processor = enhanced_processor
def process_for_rag(self, file_path: Path) -> Dict[str, Any]:
"""
Process Excel file for RAG system
Args:
file_path (Path): Path to Excel file
Returns:
Dict[str, Any]: Processed content and metadata
"""
# Process Excel file
content = self.processor.process_excel(file_path)
# Get all metadata
metadata = {
'sheet_summaries': self.processor.sheet_summaries,
'relationships': self.processor.relationships,
'sheet_metadata': self.processor.sheet_metadata
}
# Create chunks based on logical divisions
chunks = self._create_semantic_chunks(content)
return {
'content': content,
'chunks': chunks,
'metadata': metadata
}
def _create_semantic_chunks(self, content: str) -> list:
"""
Create meaningful chunks from Excel content
Args:
content (str): Processed Excel content
Returns:
list: List of content chunks
"""
chunks = []
current_chunk = []
current_sheet = None
for line in content.split('\n'):
# Start new chunk for each sheet
if line.startswith('Sheet: '):
if current_chunk:
chunks.append('\n'.join(current_chunk))
current_chunk = []
current_sheet = line
current_chunk.append(line)
# Start new chunk for major sections within sheet
elif any(line.startswith(section) for section in
['Numeric Columns Summary:', 'Categorical Columns Summary:',
'Sample Data:', 'Sheet Relationships:']):
if current_chunk:
chunks.append('\n'.join(current_chunk))
current_chunk = []
if current_sheet:
current_chunk.append(current_sheet)
current_chunk.append(line)
else:
current_chunk.append(line)
# Add final chunk
if current_chunk:
chunks.append('\n'.join(current_chunk))
return chunks
def get_sheet_context(self, sheet_name: str) -> str:
"""
Get specific context for a sheet
Args:
sheet_name (str): Name of the sheet
Returns:
str: Contextual information about the sheet
"""
if sheet_name not in self.processor.sheet_metadata:
return ""
metadata = self.processor.sheet_metadata[sheet_name]
summary = self.processor.sheet_summaries[sheet_name]
context_parts = [
f"Sheet: {sheet_name}",
f"Total Rows: {metadata['rows']}",
f"Columns: {', '.join(metadata['columns'])}",
]
# Add numeric column summaries
if metadata['numeric_columns']:
context_parts.append("\nNumeric Columns:")
for col in metadata['numeric_columns']:
stats = summary['numeric_summaries'][col]
context_parts.append(f"- {col}: Range {stats['min']} to {stats['max']}, "
f"Average {stats['mean']:.2f}")
# Add categorical column summaries
if metadata['categorical_columns']:
context_parts.append("\nCategorical Columns:")
for col in metadata['categorical_columns']:
cats = summary['categorical_summaries'][col]
context_parts.append(f"- {col}: {cats['unique_values']} unique values")
return "\n".join(context_parts)
def get_relationship_context(self) -> str:
"""
Get context about relationships between sheets
Returns:
str: Information about sheet relationships
"""
if not self.processor.relationships:
return "No relationships detected between sheets."
context_parts = ["Sheet Relationships:"]
for rel_key, rel_info in self.processor.relationships.items():
if rel_info['type'] == 'potential_join':
sheets = rel_