# src/utils/document_processor.py

from typing import List, Dict, Optional, Union
import asyncio
import hashlib
import json
import logging
from datetime import datetime
from pathlib import Path
from queue import Queue

import docx
import magic  # python-magic library for file type detection
import pandas as pd
import PyPDF2
from bs4 import BeautifulSoup
from bs4.element import ProcessingInstruction
from langchain.text_splitter import RecursiveCharacterTextSplitter

from .enhanced_excel_processor import EnhancedExcelProcessor

class DocumentProcessor:
    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        max_file_size: int = 10 * 1024 * 1024,  # 10MB
        supported_formats: Optional[List[str]] = None
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_file_size = max_file_size
        self.supported_formats = supported_formats or [
            '.txt', '.pdf', '.docx', '.csv', '.json',
            '.html', '.md', '.xml', '.rtf', '.xlsx', '.xls'
        ]
        self.processing_queue = Queue()
        self.processed_docs = {}
        self._initialize_text_splitter()

        # Initialize Excel processor
        self.excel_processor = EnhancedExcelProcessor()

        # Check for optional packages and warn if support will be limited
        try:
            import striprtf.striprtf
        except ImportError:
            logging.warning("striprtf package not found. RTF support will be limited.")
        try:
            from bs4 import BeautifulSoup
            import lxml
        except ImportError:
            logging.warning("beautifulsoup4 or lxml package not found. XML support will be limited.")
    def _initialize_text_splitter(self):
        """Initialize the text splitter with custom settings"""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def _extract_content(self, file_path: Path) -> str:
        """Extract content from different file formats"""
        suffix = file_path.suffix.lower()
        try:
            if suffix == '.pdf':
                return self._extract_pdf(file_path)
            elif suffix == '.docx':
                return self._extract_docx(file_path)
            elif suffix == '.csv':
                return self._extract_csv(file_path)
            elif suffix == '.json':
                return self._extract_json(file_path)
            elif suffix == '.html':
                return self._extract_html(file_path)
            elif suffix in ('.txt', '.md'):
                return self._extract_text(file_path)
            elif suffix == '.xml':
                return self._extract_xml(file_path)
            elif suffix == '.rtf':
                return self._extract_rtf(file_path)
            elif suffix in ('.xlsx', '.xls'):
                return self._extract_excel(file_path)
            else:
                raise ValueError(f"Unsupported format: {suffix}")
        except Exception as e:
            raise Exception(f"Error extracting content from {file_path}: {str(e)}")

    def _extract_text(self, file_path: Path) -> str:
        """Extract content from text-based files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Fall back to latin-1, which accepts any byte sequence
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
    def _extract_pdf(self, file_path: Path) -> str:
        """Extract text from PDF files"""
        # NOTE: embedded images are not extracted; only the text layer is read.
        text = ""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # extract_text() can return None for pages without a text layer
                text += (page.extract_text() or "") + "\n\n"
        return text.strip()
    def _extract_docx(self, file_path: Path) -> str:
        """Extract text from DOCX, including tables"""
        doc = docx.Document(file_path)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        for table in doc.tables:
            for row in table.rows:
                row_text = [cell.text for cell in row.cells]
                full_text.append(" | ".join(row_text))
        return "\n\n".join(full_text)

    def _extract_csv(self, file_path: Path) -> str:
        """Convert CSV to structured text"""
        df = pd.read_csv(file_path)
        return df.to_string()

    def _extract_json(self, file_path: Path) -> str:
        """Convert JSON to readable text"""
        with open(file_path, encoding='utf-8') as f:
            data = json.load(f)
        return json.dumps(data, indent=2)

    def _extract_html(self, file_path: Path) -> str:
        """Extract text from HTML with structure preservation"""
        with open(file_path, encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        # Drop non-content elements before extracting text
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator='\n')
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n\n".join(lines)
    def _extract_xml(self, file_path: Path) -> str:
        """Extract text from XML with structure preservation"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'xml')
            # Remove processing instructions (e.g. <?xml-stylesheet ...?>);
            # bs4 deprecated the `text` keyword in favor of `string`
            for pi in soup.find_all(string=lambda text: isinstance(text, ProcessingInstruction)):
                pi.extract()
            text = soup.get_text(separator='\n')
            lines = [line.strip() for line in text.splitlines() if line.strip()]
            return "\n\n".join(lines)
        except Exception as e:
            raise Exception(f"Error processing XML file: {str(e)}")
    def _extract_rtf(self, file_path: Path) -> str:
        """Extract text from RTF files"""
        try:
            import striprtf.striprtf as striprtf
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                rtf_text = f.read()
            plain_text = striprtf.rtf_to_text(rtf_text)
            lines = [line.strip() for line in plain_text.splitlines() if line.strip()]
            return "\n\n".join(lines)
        except ImportError:
            raise ImportError("striprtf package is required for RTF support.")
        except Exception as e:
            raise Exception(f"Error processing RTF file: {str(e)}")
    def _extract_excel(self, file_path: Path) -> str:
        """Extract content from Excel files with enhanced processing"""
        try:
            # Use the enhanced Excel processor
            processed_content = self.excel_processor.process_excel(file_path)

            # If processing fails, fall back to basic processing
            if not processed_content:
                logging.warning(
                    f"Enhanced Excel processing failed for {file_path}, "
                    "falling back to basic processing"
                )
                return self._basic_excel_extract(file_path)

            return processed_content
        except Exception as e:
            logging.error(f"Error in enhanced Excel processing: {str(e)}")
            # Fall back to basic Excel processing
            return self._basic_excel_extract(file_path)

    def _basic_excel_extract(self, file_path: Path) -> str:
        """Basic Excel extraction as a fallback"""
        try:
            excel_file = pd.ExcelFile(file_path)
            sheets_data = []
            for sheet_name in excel_file.sheet_names:
                df = pd.read_excel(excel_file, sheet_name=sheet_name)

                # Underline the sheet header ("Sheet: " is 7 characters)
                sheet_content = f"\nSheet: {sheet_name}\n"
                sheet_content += "=" * (len(sheet_name) + 7) + "\n"

                if df.empty:
                    sheet_content += "Empty Sheet\n"
                else:
                    sheet_content += df.fillna('').to_string(
                        index=False,
                        max_rows=None,
                        max_cols=None,
                        line_width=120
                    ) + "\n"
                sheets_data.append(sheet_content)
            return "\n\n".join(sheets_data)
        except Exception as e:
            raise Exception(f"Error in basic Excel processing: {str(e)}")
    def _generate_metadata(
        self,
        file_path: Path,
        content: str,
        additional_metadata: Optional[Dict] = None
    ) -> Dict:
        """Generate comprehensive metadata"""
        file_stat = file_path.stat()
        metadata = {
            'filename': file_path.name,
            'file_type': file_path.suffix,
            'file_size': file_stat.st_size,
            # ISO strings keep the metadata dict JSON-serializable,
            # consistent with 'processing_timestamp' below
            'created_at': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
            'modified_at': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
            'content_hash': self._calculate_hash(content),
            'mime_type': magic.from_file(str(file_path), mime=True),
            'word_count': len(content.split()),
            'character_count': len(content),
            'processing_timestamp': datetime.now().isoformat()
        }

        # Add Excel-specific metadata if applicable
        if file_path.suffix.lower() in ('.xlsx', '.xls'):
            try:
                if hasattr(self.excel_processor, 'get_metadata'):
                    excel_metadata = self.excel_processor.get_metadata()
                    metadata.update({'excel_metadata': excel_metadata})
            except Exception as e:
                logging.warning(f"Could not extract Excel metadata: {str(e)}")

        if additional_metadata:
            metadata.update(additional_metadata)
        return metadata

    def _calculate_hash(self, text: str) -> str:
        """Calculate the SHA-256 hash of text"""
        return hashlib.sha256(text.encode()).hexdigest()
    async def process_document(
        self,
        file_path: Union[str, Path],
        metadata: Optional[Dict] = None
    ) -> Dict:
        """Process a document with metadata and content extraction"""
        file_path = Path(file_path)
        if not self._validate_file(file_path):
            raise ValueError(f"Invalid file: {file_path}")

        content = self._extract_content(file_path)
        doc_metadata = self._generate_metadata(file_path, content, metadata)
        chunks = self.text_splitter.split_text(content)
        chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]

        return {
            'content': content,
            'chunks': chunks,
            'chunk_hashes': chunk_hashes,
            'metadata': doc_metadata,
            'statistics': self._generate_statistics(content, chunks)
        }
    def _validate_file(self, file_path: Path) -> bool:
        """Validate file existence, format, and size"""
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        if file_path.suffix.lower() not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_path.suffix}")
        if file_path.stat().st_size > self.max_file_size:
            raise ValueError(f"File too large: {file_path}")
        if file_path.stat().st_size == 0:
            raise ValueError(f"Empty file: {file_path}")
        return True

    def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
        """Generate document statistics"""
        return {
            'total_chunks': len(chunks),
            # Guard against division by zero when extraction yields no chunks
            'average_chunk_size': (
                sum(len(chunk) for chunk in chunks) / len(chunks) if chunks else 0
            ),
            'token_estimate': len(content.split()),
            'unique_words': len(set(content.lower().split())),
            'sentences': len([s for s in content.split('.') if s.strip()]),
        }
    async def batch_process(
        self,
        file_paths: List[Union[str, Path]],
        parallel: bool = True
    ) -> Dict[str, Dict]:
        """Process multiple documents, concurrently by default"""
        results: Dict[str, Dict] = {}
        if parallel:
            # _process_and_store is a coroutine, so plain threads cannot run it
            # directly; schedule all documents concurrently on the event loop.
            await asyncio.gather(
                *(self._process_and_store(file_path, results) for file_path in file_paths)
            )
        else:
            for file_path in file_paths:
                await self._process_and_store(file_path, results)
        return results

    async def _process_and_store(
        self,
        file_path: Union[str, Path],
        results: Dict
    ):
        """Process a single document and store the result"""
        try:
            result = await self.process_document(file_path)
            results[str(file_path)] = result
        except Exception as e:
            results[str(file_path)] = {'error': str(e)}
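

# --- Usage sketch ---
# A minimal, illustrative example of driving the processor from a script;
# the "docs/..." paths below are hypothetical and not part of the module.
# It runs batch_process on the event loop via asyncio.run and prints the
# per-file chunk statistics or the recorded error.
if __name__ == "__main__":
    async def _demo():
        processor = DocumentProcessor(chunk_size=1000, chunk_overlap=200)
        results = await processor.batch_process(
            ["docs/report.pdf", "docs/data.xlsx"],  # hypothetical input files
            parallel=True
        )
        for path, result in results.items():
            if 'error' in result:
                print(f"{path}: failed ({result['error']})")
            else:
                stats = result['statistics']
                print(f"{path}: {stats['total_chunks']} chunks, "
                      f"~{stats['token_estimate']} tokens")

    asyncio.run(_demo())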