Spaces:

TalatMasud
/

chatbot-backend

Running

App Files Files Community

chatbot-backend / src /utils /excel_integration

TalatMasood

Enhanced support for Excel files and added an endpoint that provides an optimized vector store and RAG for Excel.

b953016 5 months ago

raw

history blame

4.72 kB

	from typing import Dict, Any
	from pathlib import Path

	class ExcelIntegration:
	def __init__(self, enhanced_processor):
	"""
	Initialize Excel integration

	Args:
	enhanced_processor: Instance of EnhancedExcelProcessor
	"""
	self.processor = enhanced_processor

	def process_for_rag(self, file_path: Path) -> Dict[str, Any]:
	"""
	Process Excel file for RAG system

	Args:
	file_path (Path): Path to Excel file

	Returns:
	Dict[str, Any]: Processed content and metadata
	"""
	# Process Excel file
	content = self.processor.process_excel(file_path)

	# Get all metadata
	metadata = {
	'sheet_summaries': self.processor.sheet_summaries,
	'relationships': self.processor.relationships,
	'sheet_metadata': self.processor.sheet_metadata
	}

	# Create chunks based on logical divisions
	chunks = self._create_semantic_chunks(content)

	return {
	'content': content,
	'chunks': chunks,
	'metadata': metadata
	}

	def _create_semantic_chunks(self, content: str) -> list:
	"""
	Create meaningful chunks from Excel content

	Args:
	content (str): Processed Excel content

	Returns:
	list: List of content chunks
	"""
	chunks = []
	current_chunk = []
	current_sheet = None

	for line in content.split('\n'):
	# Start new chunk for each sheet
	if line.startswith('Sheet: '):
	if current_chunk:
	chunks.append('\n'.join(current_chunk))
	current_chunk = []
	current_sheet = line
	current_chunk.append(line)

	# Start new chunk for major sections within sheet
	elif any(line.startswith(section) for section in
	['Numeric Columns Summary:', 'Categorical Columns Summary:',
	'Sample Data:', 'Sheet Relationships:']):
	if current_chunk:
	chunks.append('\n'.join(current_chunk))
	current_chunk = []
	if current_sheet:
	current_chunk.append(current_sheet)
	current_chunk.append(line)

	else:
	current_chunk.append(line)

	# Add final chunk
	if current_chunk:
	chunks.append('\n'.join(current_chunk))

	return chunks

	def get_sheet_context(self, sheet_name: str) -> str:
	"""
	Get specific context for a sheet

	Args:
	sheet_name (str): Name of the sheet

	Returns:
	str: Contextual information about the sheet
	"""
	if sheet_name not in self.processor.sheet_metadata:
	return ""

	metadata = self.processor.sheet_metadata[sheet_name]
	summary = self.processor.sheet_summaries[sheet_name]

	context_parts = [
	f"Sheet: {sheet_name}",
	f"Total Rows: {metadata['rows']}",
	f"Columns: {', '.join(metadata['columns'])}",
	]

	# Add numeric column summaries
	if metadata['numeric_columns']:
	context_parts.append("\nNumeric Columns:")
	for col in metadata['numeric_columns']:
	stats = summary['numeric_summaries'][col]
	context_parts.append(f"- {col}: Range {stats['min']} to {stats['max']}, "
	f"Average {stats['mean']:.2f}")

	# Add categorical column summaries
	if metadata['categorical_columns']:
	context_parts.append("\nCategorical Columns:")
	for col in metadata['categorical_columns']:
	cats = summary['categorical_summaries'][col]
	context_parts.append(f"- {col}: {cats['unique_values']} unique values")

	return "\n".join(context_parts)

	def get_relationship_context(self) -> str:
	"""
	Get context about relationships between sheets

	Returns:
	str: Information about sheet relationships
	"""
	if not self.processor.relationships:
	return "No relationships detected between sheets."

	context_parts = ["Sheet Relationships:"]

	for rel_key, rel_info in self.processor.relationships.items():
	if rel_info['type'] == 'potential_join':
	sheets = rel_