Spaces:

TalatMasud
/

chatbot-backend

Running

File size: 4,723 Bytes

b953016

from typing import Dict, Any
from pathlib import Path

class ExcelIntegration:
    def __init__(self, enhanced_processor):
        """
        Initialize Excel integration
        
        Args:
            enhanced_processor: Instance of EnhancedExcelProcessor
        """
        self.processor = enhanced_processor
        
    def process_for_rag(self, file_path: Path) -> Dict[str, Any]:
        """
        Process Excel file for RAG system
        
        Args:
            file_path (Path): Path to Excel file
            
        Returns:
            Dict[str, Any]: Processed content and metadata
        """
        # Process Excel file
        content = self.processor.process_excel(file_path)
        
        # Get all metadata
        metadata = {
            'sheet_summaries': self.processor.sheet_summaries,
            'relationships': self.processor.relationships,
            'sheet_metadata': self.processor.sheet_metadata
        }
        
        # Create chunks based on logical divisions
        chunks = self._create_semantic_chunks(content)
        
        return {
            'content': content,
            'chunks': chunks,
            'metadata': metadata
        }
    
    def _create_semantic_chunks(self, content: str) -> list:
        """
        Create meaningful chunks from Excel content
        
        Args:
            content (str): Processed Excel content
            
        Returns:
            list: List of content chunks
        """
        chunks = []
        current_chunk = []
        current_sheet = None
        
        for line in content.split('\n'):
            # Start new chunk for each sheet
            if line.startswith('Sheet: '):
                if current_chunk:
                    chunks.append('\n'.join(current_chunk))
                    current_chunk = []
                current_sheet = line
                current_chunk.append(line)
            
            # Start new chunk for major sections within sheet
            elif any(line.startswith(section) for section in 
                    ['Numeric Columns Summary:', 'Categorical Columns Summary:', 
                     'Sample Data:', 'Sheet Relationships:']):
                if current_chunk:
                    chunks.append('\n'.join(current_chunk))
                    current_chunk = []
                if current_sheet:
                    current_chunk.append(current_sheet)
                current_chunk.append(line)
            
            else:
                current_chunk.append(line)
        
        # Add final chunk
        if current_chunk:
            chunks.append('\n'.join(current_chunk))
        
        return chunks

    def get_sheet_context(self, sheet_name: str) -> str:
        """
        Get specific context for a sheet
        
        Args:
            sheet_name (str): Name of the sheet
            
        Returns:
            str: Contextual information about the sheet
        """
        if sheet_name not in self.processor.sheet_metadata:
            return ""
            
        metadata = self.processor.sheet_metadata[sheet_name]
        summary = self.processor.sheet_summaries[sheet_name]
        
        context_parts = [
            f"Sheet: {sheet_name}",
            f"Total Rows: {metadata['rows']}",
            f"Columns: {', '.join(metadata['columns'])}",
        ]
        
        # Add numeric column summaries
        if metadata['numeric_columns']:
            context_parts.append("\nNumeric Columns:")
            for col in metadata['numeric_columns']:
                stats = summary['numeric_summaries'][col]
                context_parts.append(f"- {col}: Range {stats['min']} to {stats['max']}, "
                                  f"Average {stats['mean']:.2f}")
        
        # Add categorical column summaries
        if metadata['categorical_columns']:
            context_parts.append("\nCategorical Columns:")
            for col in metadata['categorical_columns']:
                cats = summary['categorical_summaries'][col]
                context_parts.append(f"- {col}: {cats['unique_values']} unique values")
        
        return "\n".join(context_parts)

    def get_relationship_context(self) -> str:
        """
        Get context about relationships between sheets
        
        Returns:
            str: Information about sheet relationships
        """
        if not self.processor.relationships:
            return "No relationships detected between sheets."
            
        context_parts = ["Sheet Relationships:"]
        
        for rel_key, rel_info in self.processor.relationships.items():
            if rel_info['type'] == 'potential_join':
                sheets = rel_