Spaces:

TalatMasud
/

chatbot-backend

Running

File size: 9,130 Bytes

e87abff

# src/utils/document_processor.py
import asyncio
import csv
import hashlib
import json
import threading
from datetime import datetime
from pathlib import Path
from queue import Queue
from typing import Dict, List, Optional, Union

import docx
import magic  # python-magic library for file type detection
import pandas as pd
import PyPDF2
import requests
import tiktoken
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter

class DocumentProcessor:
    """Extract text from files, split it into chunks and describe the result.

    A processed document is returned as a dict holding the full extracted
    text, the text chunks, a SHA-256 hash per chunk, file/content metadata
    and a few simple statistics.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        max_file_size: int = 10 * 1024 * 1024,  # 10MB
        supported_formats: Optional[List[str]] = None
    ):
        """Configure limits and build the text splitter.

        Args:
            chunk_size: Maximum chunk length in characters.
            chunk_overlap: Number of characters shared by adjacent chunks.
            max_file_size: Largest accepted file size in bytes.
            supported_formats: Lower-case file suffixes (with leading dot)
                accepted by validation. Defaults to the list below. Note
                that '.rtf' passes validation but has no extractor, so
                extraction of such a file still raises.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.max_file_size = max_file_size
        self.supported_formats = supported_formats or [
            '.txt', '.pdf', '.docx', '.csv', '.json',
            '.html', '.md', '.xml', '.rtf'
        ]
        self.processing_queue = Queue()
        self.processed_docs = {}
        self._initialize_text_splitter()

    def _initialize_text_splitter(self):
        """Create the recursive character splitter from the configured sizes."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    async def process_document(
        self,
        file_path: Union[str, Path],
        metadata: Optional[Dict] = None
    ) -> Dict:
        """Validate a file, extract its text and return the processed result.

        Args:
            file_path: Path of the document to process.
            metadata: Extra key/value pairs merged into the generated
                metadata (caller values win on key collisions).

        Returns:
            Dict with the keys 'content', 'chunks', 'chunk_hashes',
            'metadata' and 'statistics'.

        Raises:
            FileNotFoundError: The file does not exist.
            ValueError: The file fails validation.
            Exception: Content extraction failed.
        """
        return self._process_document_sync(file_path, metadata)

    def _process_document_sync(
        self,
        file_path: Union[str, Path],
        metadata: Optional[Dict] = None
    ) -> Dict:
        """Synchronous implementation shared by the async API and the
        worker threads used for parallel batch processing."""
        file_path = Path(file_path)

        # _validate_file raises on every failure; the falsy check only
        # guards against an override that returns False instead.
        if not self._validate_file(file_path):
            raise ValueError(f"Invalid file: {file_path}")

        content = self._extract_content(file_path)
        doc_metadata = self._generate_metadata(file_path, content, metadata)
        chunks = self.text_splitter.split_text(content)
        chunk_hashes = [self._calculate_hash(chunk) for chunk in chunks]

        return {
            'content': content,
            'chunks': chunks,
            'chunk_hashes': chunk_hashes,
            'metadata': doc_metadata,
            'statistics': self._generate_statistics(content, chunks)
        }

    def _validate_file(self, file_path: Path) -> bool:
        """Check that the file exists, is a supported type and has a sane size.

        Returns:
            True when every check passes.

        Raises:
            FileNotFoundError: The path does not exist.
            ValueError: The path is not a regular file, has an unsupported
                suffix, exceeds ``max_file_size`` or is empty.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # A directory named e.g. "archive.txt" would otherwise get through.
        if not file_path.is_file():
            raise ValueError(f"Not a regular file: {file_path}")

        if file_path.suffix.lower() not in self.supported_formats:
            raise ValueError(f"Unsupported file format: {file_path.suffix}")

        size = file_path.stat().st_size
        if size > self.max_file_size:
            raise ValueError(f"File too large: {file_path}")

        if size == 0:
            raise ValueError(f"Empty file: {file_path}")

        return True

    def _extract_content(self, file_path: Path) -> str:
        """Dispatch to the extractor matching the file suffix.

        '.txt' and '.md' are read verbatim as UTF-8; '.html' and '.xml' go
        through the markup extractor. Any other suffix without an
        extractor (including '.rtf') is reported as unsupported.

        Raises:
            Exception: Wraps any extraction failure; the original error is
                attached as ``__cause__``.
        """
        suffix = file_path.suffix.lower()

        try:
            if suffix == '.pdf':
                return self._extract_pdf(file_path)
            elif suffix == '.docx':
                return self._extract_docx(file_path)
            elif suffix == '.csv':
                return self._extract_csv(file_path)
            elif suffix == '.json':
                return self._extract_json(file_path)
            elif suffix in ('.html', '.xml'):
                return self._extract_html(file_path)
            elif suffix in ('.txt', '.md'):
                return file_path.read_text(encoding='utf-8')
            else:
                raise ValueError(f"Unsupported format: {suffix}")
        except Exception as e:
            raise Exception(
                f"Error extracting content from {file_path}: {str(e)}"
            ) from e

    def _extract_pdf(self, file_path: Path) -> str:
        """Return the text of every PDF page, pages separated by a blank line."""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Guard against an extractor that yields None for a page
            # without a text layer.
            page_texts = [(page.extract_text() or "") for page in reader.pages]

        return "\n\n".join(page_texts).strip()

    def _extract_docx(self, file_path: Path) -> str:
        """Return paragraph text followed by table rows ("a | b | c")."""
        doc = docx.Document(str(file_path))
        full_text = [para.text for para in doc.paragraphs]

        # Tables are appended after all paragraphs, one line per row.
        for table in doc.tables:
            for row in table.rows:
                full_text.append(" | ".join(cell.text for cell in row.cells))

        return "\n\n".join(full_text)

    def _extract_csv(self, file_path: Path) -> str:
        """Load the CSV with pandas and render it as an aligned text table."""
        df = pd.read_csv(file_path)
        return df.to_string()

    def _extract_json(self, file_path: Path) -> str:
        """Parse the JSON file and re-serialise it with 2-space indentation."""
        # JSON interchange is UTF-8; do not depend on the platform default.
        with open(file_path, encoding='utf-8') as f:
            data = json.load(f)
        # Keep non-ASCII characters readable instead of \uXXXX escapes.
        return json.dumps(data, indent=2, ensure_ascii=False)

    def _extract_html(self, file_path: Path) -> str:
        """Return the visible text of an HTML/XML file, one block per line.

        <script> and <style> elements are dropped; remaining text nodes are
        stripped, blank ones removed, and the rest joined by blank lines.
        """
        # Binary mode lets BeautifulSoup detect the document encoding
        # rather than decoding with the platform default.
        with open(file_path, 'rb') as f:
            soup = BeautifulSoup(f, 'html.parser')

        for element in soup(["script", "style"]):
            element.decompose()

        text = soup.get_text(separator='\n')
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        return "\n\n".join(lines)

    def _generate_metadata(
        self,
        file_path: Path,
        content: str,
        additional_metadata: Optional[Dict] = None
    ) -> Dict:
        """Build file- and content-level metadata for a document.

        Args:
            file_path: Path of the processed file.
            content: Extracted text of the file.
            additional_metadata: Optional caller-supplied entries; they are
                applied last and therefore override generated keys.

        Returns:
            Dict of metadata. 'created_at' and 'modified_at' are
            ``datetime`` objects, 'processing_timestamp' is an ISO string.
        """
        file_stat = file_path.stat()

        metadata = {
            'filename': file_path.name,
            'file_type': file_path.suffix,
            'file_size': file_stat.st_size,
            # NOTE: st_ctime is the creation time on Windows but the last
            # metadata-change time on most Unix systems.
            'created_at': datetime.fromtimestamp(file_stat.st_ctime),
            'modified_at': datetime.fromtimestamp(file_stat.st_mtime),
            'content_hash': self._calculate_hash(content),
            'mime_type': magic.from_file(str(file_path), mime=True),
            'word_count': len(content.split()),
            'character_count': len(content),
            'processing_timestamp': datetime.now().isoformat()
        }

        if additional_metadata:
            metadata.update(additional_metadata)

        return metadata

    def _generate_statistics(self, content: str, chunks: List[str]) -> Dict:
        """Compute simple size statistics for the text and its chunks.

        'token_estimate' is a whitespace word count and 'sentences' counts
        non-blank segments between '.' characters; both are rough
        approximations. An empty chunk list gives an average size of 0.0.
        """
        total_chunks = len(chunks)
        if total_chunks:
            average_chunk_size = sum(len(chunk) for chunk in chunks) / total_chunks
        else:
            # No text could be split (e.g. whitespace-only content).
            average_chunk_size = 0.0

        return {
            'total_chunks': total_chunks,
            'average_chunk_size': average_chunk_size,
            'token_estimate': len(content.split()),
            'unique_words': len(set(content.lower().split())),
            'sentences': len([s for s in content.split('.') if s.strip()]),
        }

    def _calculate_hash(self, text: str) -> str:
        """Return the hex SHA-256 digest of the UTF-8 encoded text."""
        return hashlib.sha256(text.encode('utf-8')).hexdigest()

    async def batch_process(
        self,
        file_paths: List[Union[str, Path]],
        parallel: bool = True
    ) -> Dict[str, Dict]:
        """Process several documents and collect one result per path.

        Args:
            file_paths: Documents to process.
            parallel: When True, each document is processed in a worker
                thread of the running loop's default executor; otherwise
                documents are processed one after another.

        Returns:
            Dict keyed by ``str(path)``. Each value is either the dict
            produced for that document or ``{'error': message}`` when
            processing failed; failures never abort the batch.
        """
        results: Dict[str, Dict] = {}
        paths = list(file_paths)

        if parallel:
            # The processing work is blocking, so it runs in executor
            # threads and is awaited here without stalling the event loop.
            loop = asyncio.get_running_loop()
            outcomes = await asyncio.gather(*(
                loop.run_in_executor(None, self._safe_process, path)
                for path in paths
            ))
            for path, outcome in zip(paths, outcomes):
                results[str(path)] = outcome
        else:
            for path in paths:
                await self._process_and_store(path, results)

        return results

    def _safe_process(self, file_path: Union[str, Path]) -> Dict:
        """Process one document synchronously, mapping any failure to an
        ``{'error': message}`` dict instead of raising."""
        try:
            return self._process_document_sync(file_path)
        except Exception as e:
            return {'error': str(e)}

    async def _process_and_store(
        self,
        file_path: Union[str, Path],
        results: Dict
    ):
        """Process one document and store its result (or error) in ``results``.

        The entry is written under ``str(file_path)``; on failure it is
        ``{'error': message}``.
        """
        try:
            result = await self.process_document(file_path)
            results[str(file_path)] = result
        except Exception as e:
            results[str(file_path)] = {'error': str(e)}