# Spaces: TalatMasud / chatbot-backend (Running)
#
# File size: 13,689 Bytes

# src/utils/document_processor.py
from typing import List, Dict, Optional, Union
import PyPDF2
import docx
import pandas as pd
import json
from pathlib import Path
import hashlib
import magic  # python-magic library for file type detection
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import threading
from queue import Queue
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging
from bs4.element import ProcessingInstruction
from .enhanced_excel_processor import EnhancedExcelProcessor

class DocumentProcessor:
    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        max_file_size: int = 10 * 1024 * 1024,  # 10MB
        supported_formats: Optional[List[str]] = None
    ):
        """Store processing limits and set up the helper objects.

        Args:
            chunk_size: Chunk size used when the text splitter is initialised.
            chunk_overlap: Chunk overlap used when the text splitter is initialised.
            max_file_size: Largest file size, in bytes, accepted by validation.
            supported_formats: Accepted file suffixes (with leading dot). A
                falsy value selects the built-in default list.
        """
        default_formats = [
            '.txt', '.pdf', '.docx', '.csv', '.json',
            '.html', '.md', '.xml', '.rtf', '.xlsx', '.xls'
        ]

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_file_size = max_file_size
        self.supported_formats = supported_formats if supported_formats else default_formats
        self.processing_queue = Queue()
        self.processed_docs = {}
        self._initialize_text_splitter()

        # Spreadsheet handling is delegated to a dedicated helper object.
        self.excel_processor = EnhancedExcelProcessor()

        # Probe the optional dependencies now so that a missing package is
        # logged at construction time rather than on first use.
        try:
            import striprtf.striprtf
        except ImportError:
            logging.warning("Warning: striprtf package not found. RTF support will be limited.")

        try:
            from bs4 import BeautifulSoup
            import lxml
        except ImportError:
            logging.warning("Warning: beautifulsoup4 or lxml package not found. XML support will be limited.")

    def _initialize_text_splitter(self):
        """Create ``self.text_splitter`` from the configured chunk settings."""
        splitter_options = {
            'chunk_size': self.chunk_size,
            'chunk_overlap': self.chunk_overlap,
            # Chunk length is measured in characters.
            'length_function': len,
            'separators': ["\n\n", "\n", " ", ""],
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def _extract_content(self, file_path: Path) -> str:
        """Extract content from different file formats"""
        suffix = file_path.suffix.lower()
    
        try:
            if suffix == '.pdf':
                return self._extract_pdf(file_path)
            elif suffix == '.docx':
                return self._extract_docx(file_path)
            elif suffix == '.csv':
                return self._extract_csv(file_path)
            elif suffix == '.json':
                return self._extract_json(file_path)
            elif suffix == '.html':
                return self._extract_html(file_path)
            elif suffix == '.txt' or suffix == '.md':
                return self._extract_text(file_path)
            elif suffix == '.xml':
                return self._extract_xml(file_path)
            elif suffix == '.rtf':
                return self._extract_rtf(file_path)
            elif suffix in ['.xlsx', '.xls']:
                return self._extract_excel(file_path)
            else:
                raise ValueError(f"Unsupported format: {suffix}")
        except Exception as e:
            raise Exception(f"Error extracting content from {file_path}: {str(e)}")

    def _extract_text(self, file_path: Path) -> str:
        """Extract content from text-based files"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()

    def _extract_pdf(self, file_path: Path) -> str:
        """Extract the text of every page of a PDF.

        Page texts are joined with a blank line and the result is stripped of
        leading and trailing whitespace. Embedded images are not extracted.

        Args:
            file_path: Path of the PDF file.

        Returns:
            The concatenated page text; an empty string if nothing was extracted.
        """
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Pages must be read while the file is still open.
            # Defensive: treat a falsy extraction result as empty text.
            page_texts = [(page.extract_text() or "") for page in reader.pages]

        return "\n\n".join(page_texts).strip()

    def _extract_docx(self, file_path: Path) -> str:
        """Collect paragraph text, then table rows, from a DOCX file.

        Each table row becomes one entry with its cell texts joined by
        ``" | "``. All entries are separated by a blank line.
        """
        document = docx.Document(file_path)

        pieces = [paragraph.text for paragraph in document.paragraphs]
        pieces.extend(
            " | ".join(cell.text for cell in row.cells)
            for table in document.tables
            for row in table.rows
        )

        return "\n\n".join(pieces)

    def _extract_csv(self, file_path: Path) -> str:
        """Convert CSV to structured text"""
        df = pd.read_csv(file_path)
        return df.to_string()

    def _extract_json(self, file_path: Path) -> str:
        """Convert JSON to readable text"""
        with open(file_path) as f:
            data = json.load(f)
        return json.dumps(data, indent=2)

    def _extract_html(self, file_path: Path) -> str:
        """Extract the visible text of an HTML file.

        ``<script>`` and ``<style>`` elements are removed, the remaining text
        is split into lines, blank lines are dropped and the rest is joined
        with blank lines.

        Args:
            file_path: Path of the HTML file.

        Returns:
            The stripped, non-empty text lines separated by a blank line.
        """
        # Hand the parser raw bytes so BeautifulSoup detects the document's
        # encoding itself instead of relying on the platform default.
        with open(file_path, 'rb') as f:
            soup = BeautifulSoup(f, 'html.parser')

        for tag in soup(["script", "style"]):
            tag.decompose()

        text = soup.get_text(separator='\n')
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n\n".join(lines)

    def _extract_xml(self, file_path: Path) -> str:
        """Extract the text content of an XML file.

        Processing instructions are removed before the text is collected.
        The text is then split into lines, blank lines are dropped and the
        rest is joined with blank lines.

        Args:
            file_path: Path of the XML file.

        Returns:
            The stripped, non-empty text lines separated by a blank line.

        Raises:
            Exception: If reading or parsing fails. The underlying error is
                attached as ``__cause__``.
        """
        try:
            # Binary mode lets the parser honour the encoding named in the
            # XML declaration instead of assuming UTF-8.
            with open(file_path, 'rb') as f:
                soup = BeautifulSoup(f, 'xml')

            # ``string=`` replaces the deprecated ``text=`` filter argument.
            for pi in soup.find_all(string=lambda node: isinstance(node, ProcessingInstruction)):
                pi.extract()

            text = soup.get_text(separator='\n')
            lines = [line.strip() for line in text.splitlines() if line.strip()]
            return "\n\n".join(lines)
        except Exception as e:
            raise Exception(f"Error processing XML file: {str(e)}") from e

    def _extract_rtf(self, file_path: Path) -> str:
        """Convert an RTF file to plain text using the ``striprtf`` package.

        The file is read as UTF-8 with undecodable bytes ignored. Blank lines
        are dropped from the converted text and the rest is joined with
        blank lines.

        Raises:
            ImportError: If ``striprtf`` cannot be imported.
            Exception: For any other failure while reading or converting.
        """
        try:
            import striprtf.striprtf as striprtf

            with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
                raw_rtf = handle.read()

            converted = striprtf.rtf_to_text(raw_rtf)
            # Strip every line, then discard the ones that end up empty.
            non_blank = filter(None, (line.strip() for line in converted.splitlines()))
            return "\n\n".join(non_blank)
        except ImportError:
            raise ImportError("striprtf package is required for RTF support.")
        except Exception as e:
            raise Exception(f"Error processing RTF file: {str(e)}")

    def _extract_excel(self, file_path: Path) -> str:
        """Extract content from Excel files with enhanced processing"""
        try:
            # Use enhanced Excel processor
            processed_content = self.excel_processor.process_excel(file_path)
            
            # If processing fails, fall back to basic processing
            if not processed_content:
                logging.warning(f"Enhanced Excel processing failed for {file_path}, falling back to basic processing")
                return self._basic_excel_extract(file_path)
                
            return processed_content
            
        except Exception as e:
            logging.error(f"Error in enhanced Excel processing: {str(e)}")
            # Fall back to basic Excel processing
            return self._basic_excel_extract(file_path)

    def _basic_excel_extract(self, file_path: Path) -> str:
        """Basic Excel extraction as fallback"""
        try:
            excel_file = pd.ExcelFile(file_path)
            sheets_data = []
            
            for sheet_name in excel_file.sheet_names:
                df = pd.read_excel(excel_file, sheet_name=sheet_name)
                sheet_content = f"\nSheet: {sheet_name}\n"
                sheet_content += "=" * (len(sheet_name) + 7) + "\n"
                
                if df.empty:
                    sheet_content += "Empty Sheet\n"
                else:
                    sheet_content += df.fillna('').to_string(
                        index=False,
                        max_rows=None,
                        max_cols=None,
                        line_width=120
                    ) + "\n"
                    
                sheets_data.append(sheet_content)
            
            return "\n\n".join(sheets_data)
            
        except Exception as e:
            raise Exception(f"Error in basic Excel processing: {str(e)}")

    def _generate_metadata(
        self,
        file_path: Path,
        content: str,
        additional_metadata: Optional[Dict] = None
    ) -> Dict:
        """Build the metadata dictionary for a processed file.

        Combines file-system facts (name, suffix, size, timestamps), the MIME
        type reported by ``magic``, and figures derived from ``content``
        (hash, word count, character count). For ``.xlsx``/``.xls`` files the
        Excel processor's metadata is added under ``excel_metadata`` when it
        offers ``get_metadata``. ``additional_metadata`` is merged last and
        can therefore override any generated key.
        """
        stat_result = file_path.stat()
        suffix = file_path.suffix

        metadata = dict(
            filename=file_path.name,
            file_type=suffix,
            file_size=stat_result.st_size,
            # NOTE(review): st_ctime is platform-dependent (creation time on
            # Windows, metadata-change time on Unix) — confirm intent.
            created_at=datetime.fromtimestamp(stat_result.st_ctime),
            modified_at=datetime.fromtimestamp(stat_result.st_mtime),
            content_hash=self._calculate_hash(content),
            mime_type=magic.from_file(str(file_path), mime=True),
            word_count=len(content.split()),
            character_count=len(content),
            processing_timestamp=datetime.now().isoformat(),
        )

        # Spreadsheet extras are best-effort: a failure is logged, not raised.
        is_spreadsheet = suffix.lower() in ['.xlsx', '.xls']
        if is_spreadsheet:
            try:
                if hasattr(self.excel_processor, 'get_metadata'):
                    metadata['excel_metadata'] = self.excel_processor.get_metadata()
            except Exception as e:
                logging.warning(f"Could not extract Excel metadata: {str(e)}")

        if additional_metadata:
            metadata.update(additional_metadata)

        return metadata

    def _calculate_hash(self, text: str) -> str:
        """Calculate SHA-256 hash of text"""
        return hashlib.sha256(text.encode()).hexdigest()

    async def process_document(
        self,
        file_path: Union[str, Path],
        metadata: Optional[Dict] = None
    ) -> Dict:
        """Process a document with metadata and content extraction"""
        file_path = Path(file_path)
        
        if not self._validate_file(file_path):
            raise ValueError(f"Invalid file: {file_path}")

        content = self._extract_content(file_path)
        doc_metadata = self._generate_metadata(file_path, content, metadata)
        chunks = self.text_splitter.split_text(content)
        chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]
        
        return {
            'content': content,
            'chunks': chunks,
            'chunk_hashes': chunk_hashes,
            'metadata': doc_metadata,
            'statistics': self._generate_statistics(content, chunks)
        }

    def _validate_file(self, file_path: Path) -> bool:
        """Validate file type, size, and content"""
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
            
        if file_path.suffix.lower() not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_path.suffix}")
            
        if file_path.stat().st_size > self.max_file_size:
            raise ValueError(f"File too large: {file_path}")
            
        if file_path.stat().st_size == 0:
            raise ValueError(f"Empty file: {file_path}")
            
        return True

    def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
        """Generate document statistics"""
        return {
            'total_chunks': len(chunks),
            'average_chunk_size': sum(len(chunk) for chunk in chunks) / len(chunks),
            'token_estimate': len(content.split()),
            'unique_words': len(set(content.lower().split())),
            'sentences': len([s for s in content.split('.') if s.strip()]),
        }

    async def batch_process(
        self,
        file_paths: List[Union[str, Path]],
        parallel: bool = True
    ) -> Dict[str, Dict]:
        """Process multiple documents in parallel"""
        results = {}
        
        if parallel:
            threads = []
            for file_path in file_paths:
                thread = threading.Thread(
                    target=self._process_and_store,
                    args=(file_path, results)
                )
                threads.append(thread)
                thread.start()
                
            for thread in threads:
                thread.join()
        else:
            for file_path in file_paths:
                await self._process_and_store(file_path, results)
                
        return results

    async def _process_and_store(
        self,
        file_path: Union[str, Path],
        results: Dict
    ):
        """Process a single document and store results"""
        try:
            result = await self.process_document(file_path)
            results[str(file_path)] = result
        except Exception as e:
            results[str(file_path)] = {'error': str(e)}