# src/utils/document_processor.py

import asyncio
import hashlib
import json
import logging
import mimetypes
import threading
from datetime import datetime
from pathlib import Path
from queue import Queue
from typing import Dict, List, Optional, Union

import docx
import pandas as pd
import PyPDF2
from bs4 import BeautifulSoup
from bs4.element import ProcessingInstruction
from langchain.text_splitter import RecursiveCharacterTextSplitter

from config.config import Settings

from .enhanced_excel_processor import EnhancedExcelProcessor


class DocumentProcessor:
    def __init__(
        self,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        max_file_size: Optional[int] = None,
        supported_formats: Optional[List[str]] = None
    ):
        """
        Initialize DocumentProcessor with configurable parameters

        Args:
            chunk_size (Optional[int]): Size of text chunks
            chunk_overlap (Optional[int]): Overlap between chunks
            max_file_size (Optional[int]): Maximum file size in bytes
            supported_formats (Optional[List[str]]): List of supported file extensions
        """
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

        # Get settings with validation
        default_settings = Settings.get_document_processor_settings()

        # Use provided values or fall back to defaults from settings
        self.chunk_size = chunk_size if chunk_size is not None else default_settings['chunk_size']
        self.chunk_overlap = (
            chunk_overlap if chunk_overlap is not None
            else default_settings['chunk_overlap']
        )
        self.max_file_size = (
            max_file_size if max_file_size is not None
            else default_settings['max_file_size']
        )
        self.supported_formats = (
            supported_formats if supported_formats is not None
            else default_settings['supported_formats']
        )

        # Validate settings
        self._validate_settings()

        # Initialize processing components
        self.processing_queue = Queue()
        self.processed_docs = {}
        self._initialize_text_splitter()
        self.excel_processor = EnhancedExcelProcessor()

        # Check for optional format-support packages
        try:
            import striprtf.striprtf
        except ImportError:
            logging.warning(
                "striprtf package not found. RTF support will be limited.")

        try:
            from bs4 import BeautifulSoup  # noqa: F401 (availability check)
            import lxml  # noqa: F401 (required for BeautifulSoup's XML parser)
        except ImportError:
            logging.warning(
                "beautifulsoup4 or lxml package not found. "
                "XML support will be limited.")
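    # Note: the defaults above assume Settings.get_document_processor_settings()
    # returns a dict with at least the keys read in __init__. The values below
    # are illustrative only, not the project's actual defaults:
    #
    #     {
    #         'chunk_size': 1000,
    #         'chunk_overlap': 200,
    #         'max_file_size': 10 * 1024 * 1024,  # bytes
    #         'supported_formats': ['.pdf', '.docx', '.txt'],
    #     }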
    def _validate_settings(self):
        """Validate and adjust settings if necessary"""
        # Ensure chunk_size is positive and reasonable
        self.chunk_size = max(100, self.chunk_size)

        # Ensure chunk_overlap is less than chunk_size
        self.chunk_overlap = min(self.chunk_overlap, self.chunk_size - 50)

        # Ensure max_file_size is reasonable (minimum 1MB)
        self.max_file_size = max(1024 * 1024, self.max_file_size)

        # Ensure supported_formats contains valid extensions
        if not self.supported_formats:
            # Fall back to the default supported formats if empty
            self.supported_formats = Settings.DOCUMENT_PROCESSOR['supported_formats']

        # Ensure all formats start with a dot
        self.supported_formats = [
            f".{fmt.lower().lstrip('.')}" if not fmt.startswith('.') else fmt.lower()
            for fmt in self.supported_formats
        ]

    def _initialize_text_splitter(self):
        """Initialize the text splitter with custom settings"""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            # Separators chosen to handle markdown while maintaining overlap
            separators=["\n\n", "\n", " ", ""],
            keep_separator=True,
            add_start_index=True,
            strip_whitespace=False  # Keep whitespace to maintain markdown formatting
        )

    def _find_break_point(self, text: str, prev_chunk: str) -> int:
        """
        Find a suitable break point that maintains document structure

        Args:
            text (str): Text to find break point in (the overlap portion)
            prev_chunk (str): The complete previous chunk for context

        Returns:
            int: Position of a suitable break point
        """
        # Get the context of how the previous chunk ends
        prev_chunk_lines = prev_chunk.split('\n')

        # Special handling for markdown tables
        if '|' in prev_chunk:
            # Check if we're in the middle of a table
            table_rows = [
                line for line in prev_chunk_lines if line.strip().startswith('|')]
            if table_rows:
                # Find where the current table starts in the text
                table_start = text.find('|')
                if table_start >= 0:
                    # Find the next row boundary
                    next_row = text.find('\n', table_start)
                    if next_row >= 0:
                        return next_row + 1  # Include the newline

        # Define break point markers in order of preference
        break_markers = [
            ('\n\n', True),  # Paragraph breaks (keep marker)
            ('\n', True),    # Line breaks (keep marker)
            ('. ', True),    # Sentence endings (keep marker)
            (', ', True),    # Clause breaks (keep marker)
            (' ', False)     # Word breaks (don't keep marker)
        ]

        # Check the structure of the previous chunk's end
        last_line = prev_chunk_lines[-1] if prev_chunk_lines else ""

        # Look for each type of break point
        for marker, keep_marker in break_markers:
            if marker in text:
                # Collect every occurrence of this marker, preferring the latest
                marker_positions = [
                    i for i in range(len(text)) if text[i:i + len(marker)] == marker]
                for pos in reversed(marker_positions):
                    # Check if this break point would maintain document structure
                    if self._is_valid_break_point(text, pos, last_line):
                        return pos + (len(marker) if keep_marker else 0)

        # If no suitable break point is found, default to the exact overlap position
        return min(len(text), self.chunk_overlap)

    def _is_valid_break_point(self, text: str, position: int, last_line: str) -> bool:
        """
        Check if a break point would maintain document structure

        Args:
            text (str): Text being checked
            position (int): Potential break position
            last_line (str): Last line of previous chunk

        Returns:
            bool: True if break point is valid
        """
        # Don't break in the middle of markdown formatting
        markdown_markers = ['*', '_', '`', '[', ']', '(', ')', '#']
        if 0 < position < len(text) - 1:
            if text[position - 1] in markdown_markers or text[position + 1] in markdown_markers:
                return False

        # Don't break in the middle of a table cell
        if '|' in last_line:
            cell_count = last_line.count('|')
            text_before_break = text[:position]
            if text_before_break.count('|') % cell_count != 0:
                return False

        # Don't break URLs or code blocks
        url_patterns = ['http://', 'https://', '```', '`']
        for pattern in url_patterns:
            if pattern in text[:position] and pattern not in text[position:]:
                return False

        return True

    def _validate_chunks(self, original_text: str, chunks: List[str]) -> bool:
        """Validate that chunks maintain document integrity"""
        try:
            # Strip the fixed overlap from each later chunk and reassemble
            reconstructed = chunks[0]
            for chunk in chunks[1:]:
                if len(chunk) > self.chunk_overlap:
                    reconstructed += chunk[self.chunk_overlap:]

            # Normalize whitespace on both sides before comparing
            clean_original = ' '.join(original_text.split())
            clean_reconstructed = ' '.join(reconstructed.split())

            return clean_original == clean_reconstructed
        except Exception as e:
            logging.error(f"Error validating chunks: {str(e)}")
            return False

    def _extract_content(self, file_path: Path) -> str:
        """Extract content from different file formats"""
        suffix = file_path.suffix.lower()
        try:
            if suffix == '.pdf':
                return self._extract_pdf(file_path)
            elif suffix == '.docx':
                return self._extract_docx(file_path)
            elif suffix == '.csv':
                return self._extract_csv(file_path)
            elif suffix == '.json':
                return self._extract_json(file_path)
            elif suffix == '.html':
                return self._extract_html(file_path)
            elif suffix in ('.txt', '.md'):
                return self._extract_text(file_path)
            elif suffix == '.xml':
                return self._extract_xml(file_path)
            elif suffix == '.rtf':
                return self._extract_rtf(file_path)
            elif suffix in ('.xlsx', '.xls'):
                return self._extract_excel(file_path)
            else:
                raise ValueError(f"Unsupported format: {suffix}")
        except Exception as e:
            raise Exception(
                f"Error extracting content from {file_path}: {str(e)}")

    def _extract_text(self, file_path: Path) -> str:
        """Extract content from text-based files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Fall back to latin-1, which accepts any byte sequence
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()

    def _extract_pdf(self, file_path: Path) -> str:
        """Extract text from PDF with advanced features"""
with advanced features""" text = "" with open(file_path, 'rb') as file: reader = PyPDF2.PdfReader(file) metadata = reader.metadata for page in reader.pages: text += page.extract_text() + "\n\n" # Extract images if available if '/XObject' in page['/Resources']: for obj in page['/Resources']['/XObject'].get_object(): if page['/Resources']['/XObject'][obj]['/Subtype'] == '/Image': pass return text.strip() def _extract_docx(self, file_path: Path) -> str: """Extract text from DOCX with formatting""" doc = docx.Document(file_path) full_text = [] for para in doc.paragraphs: full_text.append(para.text) for table in doc.tables: for row in table.rows: row_text = [cell.text for cell in row.cells] full_text.append(" | ".join(row_text)) return "\n\n".join(full_text) def _extract_csv(self, file_path: Path) -> str: """Convert CSV to structured text""" df = pd.read_csv(file_path) return df.to_string() def _extract_json(self, file_path: Path) -> str: """Convert JSON to readable text""" with open(file_path) as f: data = json.load(f) return json.dumps(data, indent=2) def _extract_html(self, file_path: Path) -> str: """Extract text from HTML with structure preservation""" with open(file_path) as f: soup = BeautifulSoup(f, 'html.parser') for script in soup(["script", "style"]): script.decompose() text = soup.get_text(separator='\n') lines = [line.strip() for line in text.splitlines() if line.strip()] return "\n\n".join(lines) def _extract_xml(self, file_path: Path) -> str: """Extract text from XML with structure preservation""" try: with open(file_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'xml') for pi in soup.find_all(text=lambda text: isinstance(text, ProcessingInstruction)): pi.extract() text = soup.get_text(separator='\n') lines = [line.strip() for line in text.splitlines() if line.strip()] return "\n\n".join(lines) except Exception as e: raise Exception(f"Error processing XML file: {str(e)}") def _extract_rtf(self, file_path: Path) -> str: """Extract text from RTF files""" try: import striprtf.striprtf as striprtf with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: rtf_text = f.read() plain_text = striprtf.rtf_to_text(rtf_text) lines = [line.strip() for line in plain_text.splitlines() if line.strip()] return "\n\n".join(lines) except ImportError: raise ImportError("striprtf package is required for RTF support.") except Exception as e: raise Exception(f"Error processing RTF file: {str(e)}") def _extract_excel(self, file_path: Path) -> str: """Extract content from Excel files with enhanced processing""" try: # Use enhanced Excel processor processed_content = self.excel_processor.process_excel(file_path) # If processing fails, fall back to basic processing if not processed_content: logging.warning( f"Enhanced Excel processing failed for {file_path}, falling back to basic processing") return self._basic_excel_extract(file_path) return processed_content except Exception as e: logging.error(f"Error in enhanced Excel processing: {str(e)}") # Fall back to basic Excel processing return self._basic_excel_extract(file_path) def _basic_excel_extract(self, file_path: Path) -> str: """Basic Excel extraction as fallback""" try: excel_file = pd.ExcelFile(file_path) sheets_data = [] for sheet_name in excel_file.sheet_names: df = pd.read_excel(excel_file, sheet_name=sheet_name) sheet_content = f"\nSheet: {sheet_name}\n" sheet_content += "=" * (len(sheet_name) + 7) + "\n" if df.empty: sheet_content += "Empty Sheet\n" else: sheet_content += df.fillna('').to_string( index=False, max_rows=None, 
                sheets_data.append(sheet_content)

            return "\n\n".join(sheets_data)
        except Exception as e:
            raise Exception(f"Error in basic Excel processing: {str(e)}")

    def _get_mime_type(self, file_path: Path) -> str:
        """
        Get MIME type for a file based on its extension

        Args:
            file_path (Path): Path to the file

        Returns:
            str: MIME type of the file
        """
        # Standard MIME mappings for supported formats
        MIME_MAPPINGS = {
            '.pdf': 'application/pdf',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.doc': 'application/msword',
            '.csv': 'text/csv',
            '.json': 'application/json',
            '.html': 'text/html',
            '.txt': 'text/plain',
            '.md': 'text/markdown',
            '.xml': 'text/xml',
            '.rtf': 'application/rtf',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.xls': 'application/vnd.ms-excel'
        }

        suffix = file_path.suffix.lower()

        # Verify the file format is supported
        if suffix not in self.supported_formats:
            logging.warning(f"Unsupported file format: {suffix}")
            return 'application/octet-stream'

        # Return the known MIME type or fall back to the mimetypes module
        if suffix in MIME_MAPPINGS:
            return MIME_MAPPINGS[suffix]

        mime_type = mimetypes.guess_type(str(file_path))[0]
        return mime_type if mime_type else 'application/octet-stream'

    def _generate_metadata(
        self,
        file_path: Path,
        content: str,
        additional_metadata: Optional[Dict] = None
    ) -> Dict:
        """Generate comprehensive metadata"""
        file_stat = file_path.stat()

        metadata = {
            'filename': file_path.name,
            'file_type': file_path.suffix,
            'file_size': file_stat.st_size,
            'created_at': datetime.fromtimestamp(file_stat.st_ctime),
            'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
            'content_hash': self._calculate_hash(content),
            'mime_type': self._get_mime_type(file_path),
            'word_count': len(content.split()),
            'character_count': len(content),
            'processing_timestamp': datetime.now().isoformat()
        }

        # Add Excel-specific metadata if applicable
        if file_path.suffix.lower() in ('.xlsx', '.xls'):
            try:
                if hasattr(self.excel_processor, 'get_metadata'):
                    excel_metadata = self.excel_processor.get_metadata()
                    metadata.update({'excel_metadata': excel_metadata})
            except Exception as e:
                logging.warning(f"Could not extract Excel metadata: {str(e)}")

        if additional_metadata:
            metadata.update(additional_metadata)

        return metadata

    def _calculate_hash(self, text: str) -> str:
        """Calculate SHA-256 hash of text"""
        return hashlib.sha256(text.encode()).hexdigest()
    def _process_chunks(self, text: str) -> List[str]:
        """Process text into chunks with proper overlap"""
        chunks = self.text_splitter.split_text(text)

        # Ensure minimum chunk size and handle overlaps
        processed_chunks = []
        for i, chunk in enumerate(chunks):
            if i > 0:
                # Prepend the tail of the previous chunk as overlap
                overlap_start = max(
                    0, len(processed_chunks[-1]) - self.chunk_overlap)
                chunk = processed_chunks[-1][overlap_start:] + chunk

            if len(chunk) > self.chunk_size:
                # Split oversized chunks again
                sub_chunks = self.text_splitter.split_text(chunk)
                processed_chunks.extend(sub_chunks)
            else:
                processed_chunks.append(chunk)

        return processed_chunks

    async def process_document(self, file_path: Union[str, Path]) -> Dict:
        """Process a document with chunk overlapping"""
        file_path = Path(file_path)

        if not self._validate_file(file_path):
            raise ValueError(f"Invalid file: {file_path}")

        content = self._extract_content(file_path)
        chunks = self._process_chunks(content)

        return {
            'content': content,
            'chunks': chunks,
            'metadata': self._generate_metadata(file_path, content)
        }

    def _calculate_overlap_size(self, chunk1: str, chunk2: str) -> int:
        """Calculate the size of the overlap between two chunks"""
        min_len = min(len(chunk1), len(chunk2))
        # Find the longest suffix of chunk1 that is a prefix of chunk2
        for i in range(min_len, 0, -1):
            if chunk1[-i:] == chunk2[:i]:
                return i
        return 0

    def _validate_file(self, file_path: Path) -> bool:
        """Validate file type, size, and content"""
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        if file_path.suffix.lower() not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_path.suffix}")

        if file_path.stat().st_size > self.max_file_size:
            raise ValueError(f"File too large: {file_path}")

        if file_path.stat().st_size == 0:
            raise ValueError(f"Empty file: {file_path}")

        return True

    def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
        """Generate document statistics"""
        return {
            'total_chunks': len(chunks),
            'average_chunk_size': (
                sum(len(chunk) for chunk in chunks) / len(chunks) if chunks else 0),
            'token_estimate': len(content.split()),
            'unique_words': len(set(content.lower().split())),
            'sentences': len([s for s in content.split('.') if s.strip()]),
        }

    async def batch_process(
        self,
        file_paths: List[Union[str, Path]],
        parallel: bool = True
    ) -> Dict[str, Dict]:
        """Process multiple documents, optionally in parallel"""
        results = {}

        if parallel:
            # _process_and_store is a coroutine, so passing it directly as a
            # thread target would only create a never-awaited coroutine object.
            # Each thread runs it on its own event loop via asyncio.run; the
            # default argument binds the current file_path per thread.
            threads = []
            for file_path in file_paths:
                thread = threading.Thread(
                    target=lambda fp=file_path: asyncio.run(
                        self._process_and_store(fp, results))
                )
                threads.append(thread)
                thread.start()

            for thread in threads:
                thread.join()
        else:
            for file_path in file_paths:
                await self._process_and_store(file_path, results)

        return results

    async def _process_and_store(
        self,
        file_path: Union[str, Path],
        results: Dict
    ):
        """Process a single document and store the result"""
        try:
            result = await self.process_document(file_path)
            results[str(file_path)] = result
        except Exception as e:
            results[str(file_path)] = {'error': str(e)}
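

if __name__ == "__main__":
    # Minimal usage sketch. Assumes the package imports resolve (e.g. run as
    # `python -m src.utils.document_processor` from the project root); the
    # path "example.pdf" is hypothetical and only illustrates the async
    # entry point.
    async def _demo() -> None:
        processor = DocumentProcessor(chunk_size=1000, chunk_overlap=200)
        result = await processor.process_document("example.pdf")
        print(f"{result['metadata']['filename']}: {len(result['chunks'])} chunks")

    asyncio.run(_demo())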