# src/utils/document_processor.py
"""Extract text from documents, split it into chunks, and describe the result.

Third-party format backends are imported defensively: a missing backend only
raises (with a clear message) when the feature that needs it is actually used,
so the module can be imported and used for the formats whose backends are
installed.
"""

import asyncio
import csv
import hashlib
import json
import mimetypes
import threading
from datetime import datetime
from pathlib import Path
from queue import Queue
from typing import Dict, List, Optional, Union

try:
    import PyPDF2
except ImportError:  # only needed for .pdf
    PyPDF2 = None

try:
    import docx
except ImportError:  # only needed for .docx
    docx = None

try:
    import pandas as pd
except ImportError:  # only needed for .csv
    pd = None

try:
    import magic  # python-magic library for file type detection
except ImportError:  # also raised when the libmagic system library is absent
    magic = None

try:
    from bs4 import BeautifulSoup
except ImportError:  # only needed for .html
    BeautifulSoup = None

try:
    import requests
except ImportError:
    requests = None

try:
    import tiktoken
except ImportError:
    tiktoken = None

try:
    from langchain.text_splitter import RecursiveCharacterTextSplitter
except ImportError:  # needed to construct a DocumentProcessor
    RecursiveCharacterTextSplitter = None


class DocumentProcessingError(Exception):
    """Raised when content extraction fails; the original error is chained."""


def _require(dependency, package: str, purpose: str) -> None:
    """Raise ImportError if an optional *dependency* failed to import."""
    if dependency is None:
        raise ImportError(f"{package} is required {purpose} but is not installed")


class DocumentProcessor:
    """Validate a file, extract its text, split it into chunks, and build
    metadata and statistics for it."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        max_file_size: int = 10 * 1024 * 1024,  # 10MB
        supported_formats: Optional[List[str]] = None
    ):
        """
        Args:
            chunk_size: Maximum chunk length passed to the text splitter
                (measured with ``len``, i.e. characters).
            chunk_overlap: Overlap between consecutive chunks, in characters.
            max_file_size: Largest accepted file size in bytes.
            supported_formats: Lower-case file suffixes (with the leading
                dot) accepted by validation; defaults to a built-in list.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_file_size = max_file_size
        # NOTE: '.xml' and '.rtf' are accepted by validation but have no
        # extractor yet, so processing them fails in _extract_content.
        self.supported_formats = supported_formats or [
            '.txt', '.pdf', '.docx', '.csv', '.json',
            '.html', '.md', '.xml', '.rtf'
        ]
        self.processing_queue = Queue()
        self.processed_docs = {}
        self._initialize_text_splitter()

    def _initialize_text_splitter(self):
        """Initialize the text splitter with custom settings"""
        _require(
            RecursiveCharacterTextSplitter,
            "langchain",
            "to split documents into chunks",
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    async def process_document(
        self,
        file_path: Union[str, Path],
        metadata: Optional[Dict] = None
    ) -> Dict:
        """
        Process a document with metadata and content extraction

        Args:
            file_path: Path of the file to process.
            metadata: Extra entries merged into (and overriding) the
                generated metadata.

        Returns:
            Dict with keys 'content', 'chunks', 'chunk_hashes', 'metadata'
            and 'statistics'.

        Raises:
            FileNotFoundError: The file does not exist.
            ValueError: The file fails validation.
            DocumentProcessingError: Content extraction failed.
        """
        return self._process_document_sync(file_path, metadata)

    def _process_document_sync(
        self,
        file_path: Union[str, Path],
        metadata: Optional[Dict] = None
    ) -> Dict:
        """Blocking implementation of process_document.

        Kept synchronous so batch_process can run it on worker threads.
        """
        file_path = Path(file_path)

        # Basic validation
        if not self._validate_file(file_path):
            raise ValueError(f"Invalid file: {file_path}")

        # Extract content based on file type
        content = self._extract_content(file_path)

        # Generate document metadata
        doc_metadata = self._generate_metadata(file_path, content, metadata)

        # Split content into chunks
        chunks = self.text_splitter.split_text(content)

        # Calculate a hash for each chunk
        chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]

        return {
            'content': content,
            'chunks': chunks,
            'chunk_hashes': chunk_hashes,
            'metadata': doc_metadata,
            'statistics': self._generate_statistics(content, chunks)
        }

    def _validate_file(self, file_path: Path) -> bool:
        """
        Validate file type, size, and content

        Returns:
            True when the file passes every check.

        Raises:
            FileNotFoundError: The path does not exist.
            ValueError: The path is not a regular file, has an unsupported
                suffix, exceeds max_file_size, or is empty.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # A directory named e.g. "foo.txt" would otherwise pass validation
        # and only fail later, during extraction.
        if not file_path.is_file():
            raise ValueError(f"Not a regular file: {file_path}")

        if file_path.suffix.lower() not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_path.suffix}")

        size = file_path.stat().st_size

        if size > self.max_file_size:
            raise ValueError(f"File too large: {file_path}")

        # Check if file is not empty
        if size == 0:
            raise ValueError(f"Empty file: {file_path}")

        return True

    def _extract_content(self, file_path: Path) -> str:
        """
        Extract content from different file formats

        Raises:
            DocumentProcessingError: No extractor exists for the suffix, or
                the extractor failed; the underlying error is chained.
        """
        suffix = file_path.suffix.lower()
        extractors = {
            '.pdf': self._extract_pdf,
            '.docx': self._extract_docx,
            '.csv': self._extract_csv,
            '.json': self._extract_json,
            '.html': self._extract_html,
            '.txt': self._extract_text,
            '.md': self._extract_text,  # Markdown is plain text
        }

        try:
            extractor = extractors.get(suffix)
            if extractor is None:
                raise ValueError(f"Unsupported format: {suffix}")
            return extractor(file_path)
        except Exception as e:
            raise DocumentProcessingError(
                f"Error extracting content from {file_path}: {str(e)}"
            ) from e

    def _extract_text(self, file_path: Path) -> str:
        """Read a plain-text file as UTF-8"""
        return file_path.read_text(encoding='utf-8')

    def _extract_pdf(self, file_path: Path) -> str:
        """Extract the text of every page, separated by blank lines"""
        _require(PyPDF2, "PyPDF2", "to read .pdf files")
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # extract_text() may yield None for pages without a text layer.
            pages = [page.extract_text() or "" for page in reader.pages]
        return "\n\n".join(pages).strip()

    def _extract_docx(self, file_path: Path) -> str:
        """Extract paragraph text, then table rows as ' | '-joined cells"""
        _require(docx, "python-docx", "to read .docx files")
        doc = docx.Document(str(file_path))
        full_text = [para.text for para in doc.paragraphs]

        # Extract tables if present
        for table in doc.tables:
            for row in table.rows:
                full_text.append(" | ".join(cell.text for cell in row.cells))

        return "\n\n".join(full_text)

    def _extract_csv(self, file_path: Path) -> str:
        """Convert CSV to structured text"""
        _require(pd, "pandas", "to read .csv files")
        df = pd.read_csv(file_path)
        return df.to_string()

    def _extract_json(self, file_path: Path) -> str:
        """Convert JSON to readable text"""
        # Binary mode lets json detect UTF-8/16/32 itself instead of
        # decoding with the platform's locale encoding.
        with open(file_path, 'rb') as f:
            data = json.load(f)
        return json.dumps(data, indent=2)

    def _extract_html(self, file_path: Path) -> str:
        """Extract visible text from HTML, one non-empty line per paragraph"""
        _require(BeautifulSoup, "beautifulsoup4", "to read .html files")
        # Binary mode lets BeautifulSoup detect the document's encoding
        # instead of decoding with the platform's locale encoding.
        with open(file_path, 'rb') as f:
            soup = BeautifulSoup(f, 'html.parser')

        # Remove script and style elements
        for element in soup(["script", "style"]):
            element.decompose()

        text = soup.get_text(separator='\n')
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n\n".join(lines)

    def _generate_metadata(
        self,
        file_path: Path,
        content: str,
        additional_metadata: Optional[Dict] = None
    ) -> Dict:
        """Generate comprehensive metadata

        Entries in additional_metadata override generated entries with the
        same key.
        """
        file_stat = file_path.stat()

        if magic is not None:
            mime_type = magic.from_file(str(file_path), mime=True)
        else:
            # Extension-based guess when python-magic is unavailable;
            # None if the suffix is unknown to mimetypes.
            mime_type = mimetypes.guess_type(file_path.name)[0]

        metadata = {
            'filename': file_path.name,
            'file_type': file_path.suffix,
            'file_size': file_stat.st_size,
            # NOTE: st_ctime is the creation time on Windows but the last
            # metadata change on Unix. These two values are datetime
            # objects, unlike 'processing_timestamp' (ISO string).
            'created_at': datetime.fromtimestamp(file_stat.st_ctime),
            'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
            'content_hash': self._calculate_hash(content),
            'mime_type': mime_type,
            'word_count': len(content.split()),
            'character_count': len(content),
            'processing_timestamp': datetime.now().isoformat()
        }

        if additional_metadata:
            metadata.update(additional_metadata)

        return metadata

    def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
        """Generate document statistics

        'token_estimate' is a whitespace word count, and 'sentences' counts
        non-blank segments between '.' characters.
        """
        total_chunks = len(chunks)
        # Guard: the splitter yields no chunks for empty/blank content.
        if total_chunks:
            average_chunk_size = sum(len(chunk) for chunk in chunks) / total_chunks
        else:
            average_chunk_size = 0.0

        words = content.split()
        return {
            'total_chunks': total_chunks,
            'average_chunk_size': average_chunk_size,
            'token_estimate': len(words),
            'unique_words': len(set(content.lower().split())),
            'sentences': len([s for s in content.split('.') if s.strip()]),
        }

    def _calculate_hash(self, text: str) -> str:
        """Calculate SHA-256 hash of text"""
        return hashlib.sha256(text.encode('utf-8')).hexdigest()

    async def batch_process(
        self,
        file_paths: List[Union[str, Path]],
        parallel: bool = True
    ) -> Dict[str, Dict]:
        """
        Process multiple documents, optionally in parallel

        Args:
            file_paths: Files to process.
            parallel: Run the documents on the event loop's default thread
                pool when True; process them one after another when False.

        Returns:
            Mapping of ``str(path)`` to the process_document result, or to
            ``{'error': message}`` if that document failed. Keys follow the
            order of file_paths.
        """
        paths = list(file_paths)
        results: Dict[str, Dict] = {}

        if parallel:
            # The work is blocking, so it runs on executor threads; starting
            # a thread on a coroutine function would never execute its body.
            loop = asyncio.get_running_loop()
            outcomes = await asyncio.gather(*(
                loop.run_in_executor(None, self._process_or_error, path)
                for path in paths
            ))
            for path, outcome in zip(paths, outcomes):
                results[str(path)] = outcome
        else:
            for path in paths:
                await self._process_and_store(path, results)

        return results

    def _process_or_error(self, file_path: Union[str, Path]) -> Dict:
        """Process one document, returning {'error': message} on failure"""
        try:
            return self._process_document_sync(file_path)
        except Exception as e:  # batch boundary: report, do not abort the batch
            return {'error': str(e)}

    async def _process_and_store(
        self,
        file_path: Union[str, Path],
        results: Dict
    ):
        """Process a single document and store results"""
        try:
            results[str(file_path)] = await self.process_document(file_path)
        except Exception as e:  # batch boundary: report, do not abort the batch
            results[str(file_path)] = {'error': str(e)}