# src/utils/document_processor.py
from typing import List, Dict, Optional, Union
import PyPDF2
import docx
import pandas as pd
import json
from pathlib import Path
import hashlib
import mimetypes
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import asyncio
import threading
from queue import Queue
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging
from bs4.element import ProcessingInstruction

from config.config import Settings
from .enhanced_excel_processor import EnhancedExcelProcessor


class DocumentProcessor:
    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        max_file_size: Optional[int] = None,
        supported_formats: Optional[List[str]] = None
    ):
        """
        Initialize DocumentProcessor with configurable parameters

        Args:
            chunk_size (Optional[int]): Size of text chunks
            chunk_overlap (Optional[int]): Overlap between chunks
            max_file_size (Optional[int]): Maximum file size in bytes
            supported_formats (Optional[List[str]]): List of supported file extensions
        """
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

        # Get settings with validation
        default_settings = Settings.get_document_processor_settings()

        # Use provided values or defaults from settings
        self.chunk_size = chunk_size if chunk_size is not None else default_settings['chunk_size']
        self.chunk_overlap = chunk_overlap if chunk_overlap is not None else default_settings['chunk_overlap']
        self.max_file_size = max_file_size if max_file_size is not None else default_settings['max_file_size']
        self.supported_formats = supported_formats if supported_formats is not None else default_settings['supported_formats']

        # Validate settings
        self._validate_settings()

        # Initialize processing components
        self.processing_queue = Queue()
        self.processed_docs = {}
        self._initialize_text_splitter()
        self.excel_processor = EnhancedExcelProcessor()

        # Check for optional packages used by some extractors
        try:
            import striprtf.striprtf
        except ImportError:
            logging.warning(
                "striprtf package not found. RTF support will be limited.")
        try:
            from bs4 import BeautifulSoup
            import lxml
        except ImportError:
            logging.warning(
                "beautifulsoup4 or lxml package not found. XML support will be limited.")

    def _validate_settings(self):
        """Validate and adjust settings if necessary"""
        # Ensure chunk_size is positive and reasonable
        self.chunk_size = max(100, self.chunk_size)
        # Ensure chunk_overlap is less than chunk_size
        self.chunk_overlap = min(self.chunk_overlap, self.chunk_size - 50)
        # Ensure max_file_size is reasonable (minimum 1MB)
        self.max_file_size = max(1024 * 1024, self.max_file_size)
        # Ensure supported_formats contains valid extensions
        if not self.supported_formats:
            # Fall back to default supported formats if empty
            self.supported_formats = Settings.DOCUMENT_PROCESSOR['supported_formats']
        # Ensure all formats start with a dot and are lowercase
        self.supported_formats = [
            f".{fmt.lower().lstrip('.')}" if not fmt.startswith('.') else fmt.lower()
            for fmt in self.supported_formats
        ]

    def _initialize_text_splitter(self):
        """Initialize the text splitter with custom settings"""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            # Separators chosen to handle markdown while maintaining overlap
            separators=["\n\n", "\n", " ", ""],
            keep_separator=True,
            add_start_index=True,
            strip_whitespace=False  # Keep whitespace to maintain markdown formatting
        )

    def split_text(self, text: str) -> List[str]:
        """Split text with enforced overlap while preserving structure"""
        try:
            # Get initial split using RecursiveCharacterTextSplitter
            initial_chunks = self.text_splitter.split_text(text)
            if len(initial_chunks) <= 1:
                return initial_chunks

            # Process chunks with enforced overlap
            final_chunks = []
            for i, current_chunk in enumerate(initial_chunks):
                if i == 0:
                    final_chunks.append(current_chunk)
                    continue
                prev_chunk = final_chunks[-1]

                # Take the tail of the previous chunk as the overlap
                overlap_size = min(self.chunk_overlap, len(prev_chunk))
                overlap_text = prev_chunk[-overlap_size:]

                # For markdown tables, prepend the header rows
                if '|' in current_chunk and '\n' in current_chunk:
                    table_lines = current_chunk.split('\n')
                    header_lines = []
                    for line in table_lines:
                        if line.strip().startswith('|'):
                            header_lines.append(line)
                        else:
                            break
                    if header_lines:
                        header_text = '\n'.join(header_lines) + '\n'
                        overlap_text = header_text + overlap_text

                # Create the new chunk with the overlap prepended
                new_chunk = overlap_text + current_chunk
                # Avoid duplicating content when the chunk already starts with the overlap
                if current_chunk.startswith(overlap_text):
                    new_chunk = current_chunk

                # Carry over section markers from the previous chunk when missing
                context_markers = ['**AGENDA**', '**DISCUSSIONS**', '| No |']
                if not any(marker in new_chunk for marker in context_markers):
                    for marker in context_markers:
                        if marker in prev_chunk and marker not in new_chunk:
                            new_chunk = marker + "\n" + new_chunk
                            break

                final_chunks.append(new_chunk)

            # Validate and log overlaps
            for i in range(len(final_chunks) - 1):
                actual_overlap = self._find_actual_overlap(
                    final_chunks[i], final_chunks[i + 1])
                logging.debug(
                    f"Overlap between chunks {i} and {i + 1}: {len(actual_overlap)} characters")
                if len(actual_overlap) < self.chunk_overlap:
                    logging.warning(
                        f"Insufficient overlap between chunks {i} and {i + 1}")

            return final_chunks
        except Exception as e:
            logging.error(f"Error in split_text: {str(e)}")
            # Fall back to the plain text splitter
            return self.text_splitter.split_text(text)

    def _split_with_table_sections(self, text: str, table_sections: List[tuple]) -> List[str]:
        """
        Split text around pre-identified markdown table sections, keeping each
        table as a single chunk with overlap from the preceding chunk.

        Args:
            text (str): Full document text
            table_sections (List[tuple]): (start, end) character offsets of tables
        Returns:
            List[str]: Chunks with tables kept intact
        """
        chunks: List[str] = []
        current_position = 0
        for start, end in table_sections:
            # Process text before the table, if any
            if start > current_position:
                non_table_text = text[current_position:start]
                if non_table_text.strip():
                    text_chunks = self.text_splitter.split_text(non_table_text)
                    if chunks and text_chunks:
                        # Ensure overlap with previous chunk
                        prev_chunk = chunks[-1]
                        overlap = self._get_overlap_text(prev_chunk)
                        text_chunks[0] = overlap + text_chunks[0]
                    chunks.extend(text_chunks)
            # Process the table as a single chunk with overlap
            table_text = text[start:end]
            if chunks:
                prev_chunk = chunks[-1]
                overlap = self._get_overlap_text(prev_chunk)
                table_text = overlap + table_text
            chunks.append(table_text)
            current_position = end

        # Process remaining text after the last table
        if current_position < len(text):
            remaining_text = text[current_position:]
            if remaining_text.strip():
                text_chunks = self.text_splitter.split_text(remaining_text)
                if chunks and text_chunks:
                    # Ensure overlap with previous chunk
                    prev_chunk = chunks[-1]
                    overlap = self._get_overlap_text(prev_chunk)
                    text_chunks[0] = overlap + text_chunks[0]
                chunks.extend(text_chunks)

        # Validate and adjust overlaps
        chunks = self._ensure_minimum_overlap(chunks)

        # Log chunk boundaries for debugging
        for i in range(len(chunks) - 1):
            overlap = self._find_actual_overlap(chunks[i], chunks[i + 1])
            logging.debug(
                f"Overlap between chunks {i} and {i + 1}: {len(overlap)} characters")
            logging.debug(f"End of chunk {i}: {chunks[i][-50:]}")
            logging.debug(f"Start of chunk {i + 1}: {chunks[i + 1][:50]}")
        return chunks

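    # --- Overlap helpers -----------------------------------------------------
    # The splitting code above calls _find_actual_overlap, _get_overlap_text and
    # _ensure_minimum_overlap, but no definitions are present in this file as
    # given. The methods below are minimal sketches of the assumed behavior
    # (plain character-level overlap), not the original implementations.
    def _find_actual_overlap(self, chunk1: str, chunk2: str) -> str:
        """Return the longest suffix of chunk1 that is also a prefix of chunk2."""
        max_len = min(len(chunk1), len(chunk2))
        for size in range(max_len, 0, -1):
            if chunk1[-size:] == chunk2[:size]:
                return chunk1[-size:]
        return ""

    def _get_overlap_text(self, prev_chunk: str) -> str:
        """Return the tail of the previous chunk to prepend as overlap."""
        overlap_size = min(self.chunk_overlap, len(prev_chunk))
        return prev_chunk[-overlap_size:]

    def _ensure_minimum_overlap(self, chunks: List[str]) -> List[str]:
        """Prepend overlap where consecutive chunks share fewer than chunk_overlap characters."""
        adjusted = list(chunks)
        for i in range(1, len(adjusted)):
            existing = self._find_actual_overlap(adjusted[i - 1], adjusted[i])
            if len(existing) < self.chunk_overlap:
                adjusted[i] = self._get_overlap_text(adjusted[i - 1]) + adjusted[i]
        return adjusted
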
    def _find_break_point(self, text: str, prev_chunk: str) -> int:
        """
        Find a suitable breaking point that maintains document structure

        Args:
            text (str): Text to find break point in (the overlap portion)
            prev_chunk (str): The complete previous chunk for context
        Returns:
            int: Position of suitable break point
        """
        # Get the context of how the previous chunk ends
        prev_chunk_lines = prev_chunk.split('\n')

        # Special handling for markdown tables
        if '|' in prev_chunk:
            # Check if we're in the middle of a table
            table_rows = [
                line for line in prev_chunk_lines if line.strip().startswith('|')]
            if table_rows:
                # Find where the current table starts in the text
                table_start = text.find('|')
                if table_start >= 0:
                    # Find the next row boundary
                    next_row = text.find('\n', table_start)
                    if next_row >= 0:
                        return next_row + 1  # Include the newline

        # Break point markers in order of preference
        break_markers = [
            ('\n\n', True),  # Paragraph breaks (keep marker)
            ('\n', True),    # Line breaks (keep marker)
            ('. ', True),    # Sentence endings (keep marker)
            (', ', True),    # Clause breaks (keep marker)
            (' ', False)     # Word breaks (don't keep marker)
        ]

        # Structure of the previous chunk's last line
        last_line = prev_chunk_lines[-1] if prev_chunk_lines else ""

        # Look for each type of break point
        for marker, keep_marker in break_markers:
            if marker in text:
                # Try to find a break point that maintains document structure
                marker_positions = [
                    i for i in range(len(text)) if text[i:i + len(marker)] == marker]
                for pos in reversed(marker_positions):
                    # Check if this break point would maintain document structure
                    if self._is_valid_break_point(text, pos, last_line):
                        return pos + (len(marker) if keep_marker else 0)

        # If no suitable break point is found, default to the exact overlap position
        return min(len(text), self.chunk_overlap)

    def _is_valid_break_point(self, text: str, position: int, last_line: str) -> bool:
        """
        Check if a break point would maintain document structure

        Args:
            text (str): Text being checked
            position (int): Potential break position
            last_line (str): Last line of previous chunk
        Returns:
            bool: True if break point is valid
        """
        # Don't break in the middle of markdown formatting
        markdown_markers = ['*', '_', '`', '[', ']', '(', ')', '#']
        if 0 < position < len(text) - 1:
            if text[position - 1] in markdown_markers or text[position + 1] in markdown_markers:
                return False

        # Don't break in the middle of a table cell
        if '|' in last_line:
            cell_count = last_line.count('|')
            text_before_break = text[:position]
            if text_before_break.count('|') % cell_count != 0:
                return False

        # Don't break URLs or code blocks
        url_patterns = ['http://', 'https://', '```', '`']
        for pattern in url_patterns:
            if pattern in text[:position] and pattern not in text[position:]:
                return False

        return True

    def _validate_chunks(self, original_text: str, chunks: List[str]) -> bool:
        """Validate that chunks maintain document integrity"""
        try:
            # Strip the overlap from each chunk after the first to reconstruct the text
            reconstructed = chunks[0]
            for chunk in chunks[1:]:
                if len(chunk) > self.chunk_overlap:
                    reconstructed += chunk[self.chunk_overlap:]
            # Normalize whitespace before comparing
            clean_original = ' '.join(original_text.split())
            clean_reconstructed = ' '.join(reconstructed.split())
            return clean_original == clean_reconstructed
        except Exception as e:
            logging.error(f"Error validating chunks: {str(e)}")
            return False

    def _extract_content(self, file_path: Path) -> str:
        """Extract content from different file formats"""
        suffix = file_path.suffix.lower()
        try:
            if suffix == '.pdf':
                return self._extract_pdf(file_path)
            elif suffix == '.docx':
                return self._extract_docx(file_path)
            elif suffix == '.csv':
                return self._extract_csv(file_path)
            elif suffix == '.json':
                return self._extract_json(file_path)
            elif suffix == '.html':
                return self._extract_html(file_path)
            elif suffix in ['.txt', '.md']:
                return self._extract_text(file_path)
            elif suffix == '.xml':
                return self._extract_xml(file_path)
            elif suffix == '.rtf':
                return self._extract_rtf(file_path)
            elif suffix in ['.xlsx', '.xls']:
                return self._extract_excel(file_path)
            else:
                raise ValueError(f"Unsupported format: {suffix}")
        except Exception as e:
            raise Exception(
                f"Error extracting content from {file_path}: {str(e)}")

    def _extract_text(self, file_path: Path) -> str:
        """Extract content from text-based files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()

    def _extract_pdf(self, file_path: Path) -> str:
        """Extract text from PDF files"""
        text = ""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n\n"
                # Image handling is a placeholder: images are detected but not processed
                if '/Resources' in page and '/XObject' in page['/Resources']:
                    for obj in page['/Resources']['/XObject'].get_object():
                        if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image':
                            pass
        return text.strip()

    def _extract_docx(self, file_path: Path) -> str:
        """Extract text from DOCX with formatting"""
        doc = docx.Document(file_path)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        for table in doc.tables:
            for row in table.rows:
                row_text = [cell.text for cell in row.cells]
                full_text.append(" | ".join(row_text))
        return "\n\n".join(full_text)

    def _extract_csv(self, file_path: Path) -> str:
        """Convert CSV to structured text"""
        df = pd.read_csv(file_path)
        return df.to_string()

    def _extract_json(self, file_path: Path) -> str:
        """Convert JSON to readable text"""
        with open(file_path) as f:
            data = json.load(f)
        return json.dumps(data, indent=2)

    def _extract_html(self, file_path: Path) -> str:
        """Extract text from HTML with structure preservation"""
        with open(file_path) as f:
            soup = BeautifulSoup(f, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator='\n')
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n\n".join(lines)

    def _extract_xml(self, file_path: Path) -> str:
        """Extract text from XML with structure preservation"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'xml')
            for pi in soup.find_all(text=lambda text: isinstance(text, ProcessingInstruction)):
                pi.extract()
            text = soup.get_text(separator='\n')
            lines = [line.strip()
                     for line in text.splitlines() if line.strip()]
            return "\n\n".join(lines)
        except Exception as e:
            raise Exception(f"Error processing XML file: {str(e)}")

    def _extract_rtf(self, file_path: Path) -> str:
        """Extract text from RTF files"""
        try:
            import striprtf.striprtf as striprtf
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                rtf_text = f.read()
            plain_text = striprtf.rtf_to_text(rtf_text)
            lines = [line.strip()
                     for line in plain_text.splitlines() if line.strip()]
            return "\n\n".join(lines)
        except ImportError:
            raise ImportError("striprtf package is required for RTF support.")
        except Exception as e:
            raise Exception(f"Error processing RTF file: {str(e)}")

    def _extract_excel(self, file_path: Path) -> str:
        """Extract content from Excel files with enhanced processing"""
        try:
            # Use the enhanced Excel processor
            processed_content = self.excel_processor.process_excel(file_path)
            # If enhanced processing returns nothing, fall back to basic processing
            if not processed_content:
                logging.warning(
                    f"Enhanced Excel processing failed for {file_path}, falling back to basic processing")
                return self._basic_excel_extract(file_path)
            return processed_content
        except Exception as e:
            logging.error(f"Error in enhanced Excel processing: {str(e)}")
            # Fall back to basic Excel processing
            return self._basic_excel_extract(file_path)

    def _basic_excel_extract(self, file_path: Path) -> str:
        """Basic Excel extraction as fallback"""
        try:
            excel_file = pd.ExcelFile(file_path)
            sheets_data = []
            for sheet_name in excel_file.sheet_names:
                df = pd.read_excel(excel_file, sheet_name=sheet_name)
                sheet_content = f"\nSheet: {sheet_name}\n"
                sheet_content += "=" * (len(sheet_name) + 7) + "\n"
                if df.empty:
                    sheet_content += "Empty Sheet\n"
                else:
                    sheet_content += df.fillna('').to_string(
                        index=False,
                        max_rows=None,
                        max_cols=None,
                        line_width=120
                    ) + "\n"
                sheets_data.append(sheet_content)
            return "\n\n".join(sheets_data)
        except Exception as e:
            raise Exception(f"Error in basic Excel processing: {str(e)}")

    def _get_mime_type(self, file_path: Path) -> str:
        """
        Get MIME type for a file based on its extension

        Args:
            file_path (Path): Path to the file
        Returns:
            str: MIME type of the file
        """
        # Standard MIME mappings for supported formats
        MIME_MAPPINGS = {
            '.pdf': 'application/pdf',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.doc': 'application/msword',
            '.csv': 'text/csv',
            '.json': 'application/json',
            '.html': 'text/html',
            '.txt': 'text/plain',
            '.md': 'text/markdown',
            '.xml': 'text/xml',
            '.rtf': 'application/rtf',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.xls': 'application/vnd.ms-excel'
        }

        suffix = file_path.suffix.lower()

        # Verify the file format is supported
        if suffix not in self.supported_formats:
            logging.warning(f"Unsupported file format: {suffix}")
            return 'application/octet-stream'

        # Return the known MIME type or fall back to the mimetypes module
        if suffix in MIME_MAPPINGS:
            return MIME_MAPPINGS[suffix]
        mime_type = mimetypes.guess_type(str(file_path))[0]
        return mime_type if mime_type else 'application/octet-stream'

    def _generate_metadata(
        self,
        file_path: Path,
        content: str,
        additional_metadata: Optional[Dict] = None
    ) -> Dict:
        """Generate comprehensive metadata"""
        file_stat = file_path.stat()
        metadata = {
            'filename': file_path.name,
            'file_type': file_path.suffix,
            'file_size': file_stat.st_size,
            'created_at': datetime.fromtimestamp(file_stat.st_ctime),
            'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
            'content_hash': self._calculate_hash(content),
            'mime_type': self._get_mime_type(file_path),
            'word_count': len(content.split()),
            'character_count': len(content),
            'processing_timestamp': datetime.now().isoformat()
        }

        # Add Excel-specific metadata if applicable
        if file_path.suffix.lower() in ['.xlsx', '.xls']:
            try:
                if hasattr(self.excel_processor, 'get_metadata'):
                    excel_metadata = self.excel_processor.get_metadata()
                    metadata.update({'excel_metadata': excel_metadata})
            except Exception as e:
                logging.warning(f"Could not extract Excel metadata: {str(e)}")

        if additional_metadata:
            metadata.update(additional_metadata)
        return metadata

    def _calculate_hash(self, text: str) -> str:
        """Calculate SHA-256 hash of text"""
        return hashlib.sha256(text.encode()).hexdigest()

    async def process_document(self, file_path: Union[str, Path], metadata: Optional[Dict] = None) -> Dict:
        """Process a document with metadata and content extraction"""
        file_path = Path(file_path)
        if not self._validate_file(file_path):
            raise ValueError(f"Invalid file: {file_path}")

        content = self._extract_content(file_path)
        doc_metadata = self._generate_metadata(file_path, content, metadata)

        # Try enhanced splitting with validation
        chunks = self.split_text(content)
        if not self._validate_chunks(content, chunks):
            logging.warning(
                "Enhanced splitting failed validation, falling back to original splitter")
            chunks = self.text_splitter.split_text(content)

        # Log chunk boundaries to verify overlap
        for i in range(len(chunks) - 1):
            logging.debug(f"Chunk {i} ends with: {chunks[i][-50:]}")
            logging.debug(f"Chunk {i + 1} starts with: {chunks[i + 1][:50]}")
            logging.debug(
                f"Overlap size: {self._calculate_overlap_size(chunks[i], chunks[i + 1])} characters")

        chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
        return {
            'content': content,
            'chunks': chunks,
            'chunk_hashes': chunk_hashes,
            'metadata': doc_metadata,
            'statistics': self._generate_statistics(content, chunks)
        }

    def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
        """Calculate the size of overlap between two chunks"""
        min_len = min(len(chunk1), len(chunk2))
        for i in range(min_len, 0, -1):
            if chunk1[-i:] == chunk2[:i]:
                return i
        return 0

    def _validate_file(self, file_path: Path) -> bool:
        """Validate file type, size, and content"""
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        if file_path.suffix.lower() not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_path.suffix}")
        if file_path.stat().st_size > self.max_file_size:
            raise ValueError(f"File too large: {file_path}")
        if file_path.stat().st_size == 0:
            raise ValueError(f"Empty file: {file_path}")
        return True

    def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
        """Generate document statistics"""
        return {
            'total_chunks': len(chunks),
            'average_chunk_size': sum(len(chunk) for chunk in chunks) / len(chunks),
            'token_estimate': len(content.split()),
            'unique_words': len(set(content.lower().split())),
            'sentences': len([s for s in content.split('.') if s.strip()]),
        }

    async def batch_process(
        self,
        file_paths: List[Union[str, Path]],
        parallel: bool = True
    ) -> Dict[str, Dict]:
        """Process multiple documents, optionally in parallel"""
        results = {}
        if parallel:
            threads = []
            for file_path in file_paths:
                # _process_and_store is a coroutine, so each worker thread
                # drives it to completion with its own event loop
                thread = threading.Thread(
                    target=lambda fp=file_path: asyncio.run(
                        self._process_and_store(fp, results))
                )
                threads.append(thread)
                thread.start()
            for thread in threads:
                thread.join()
        else:
            for file_path in file_paths:
                await self._process_and_store(file_path, results)
        return results

    async def _process_and_store(
        self,
        file_path: Union[str, Path],
        results: Dict
    ):
        """Process a single document and store the result"""
        try:
            result = await self.process_document(file_path)
            results[str(file_path)] = result
        except Exception as e:
            results[str(file_path)] = {'error': str(e)}
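

# Minimal usage sketch (illustrative only). The file path below is a
# hypothetical placeholder, and the chunk settings are example values rather
# than anything defined by this module or its Settings class.
if __name__ == "__main__":
    async def _demo():
        processor = DocumentProcessor(chunk_size=1000, chunk_overlap=200)
        result = await processor.process_document("example.pdf")  # hypothetical file
        print(result['metadata']['filename'])
        print(result['statistics']['total_chunks'])

    asyncio.run(_demo())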