# Source: Hugging Face Space "Overglitch/document-summarizer" (status: Sleeping)
# File size: 6,382 bytes

import os
import re
import shutil
import time
from pathlib import Path
from datetime import date
from cleantext import clean
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from spellchecker import SpellChecker
import nltk

# Register an extra NLTK data directory, then download the tokenizer
# resources at import time.
nltk.data.path.append('/home/user/nltk_data')
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)


class Preprocessor:
    """Text preprocessing helpers: noise cleaning, spelling and spacing fixes."""

    def __init__(self):
        # A single checker instance is reused by every correct_spelling call.
        self.spell_checker = SpellChecker()

    @staticmethod
    def clean_text(text: str, lower: bool = False, lang: str = "en") -> str:
        """
        Remove noise and unwanted characters from text.

        Delegates to ``cleantext.clean`` with unicode fixing, ASCII
        transliteration and line-break removal enabled, and with the
        ``no_urls`` / ``no_emails`` / ``no_phone_numbers`` /
        ``no_currency_symbols`` options switched on. Numbers, digits and
        punctuation are left in place.

        Args:
            text: Raw input text.
            lower: Forwarded to ``clean`` as ``lower``.
            lang: Forwarded to ``clean`` as ``lang``.

        Returns:
            The cleaned text.
        """
        return clean(
            text,
            fix_unicode=True,
            to_ascii=True,
            lower=lower,
            no_line_breaks=True,
            no_urls=True,
            no_emails=True,
            no_phone_numbers=True,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=True,
            no_punct=False,
            lang=lang,
        )

    @staticmethod
    def correct_spacing(text: str, exceptions=None) -> str:
        """
        Normalize whitespace around punctuation.

        Collapses every whitespace run to a single space, drops the space
        before ``?``, ``.``, ``!`` or ``"`` when that mark is followed by
        whitespace or the end of the text, drops the space before a comma,
        and strips the result.

        Args:
            text: Text to normalize.
            exceptions: Abbreviations to restore after normalization;
                defaults to a built-in list of common ones.

        Returns:
            The text with corrected spacing.
        """
        if exceptions is None:
            exceptions = ["e.g.", "i.e.", "etc.", "cf.", "vs.", "p."]

        text = re.sub(r"\s+", " ", text)
        text = re.sub(r'\s([?.!"](?:\s|$))', r"\1", text)
        text = re.sub(r"\s,", r",", text)

        for exception in exceptions:
            # Replaces the whitespace-collapsed spelling of the exception with
            # the exception as given.
            # NOTE(review): for entries without internal whitespace (all the
            # defaults) both strings are equal, so this is a no-op; confirm
            # the intended behavior.
            text = text.replace(" ".join(exception.split()), exception)

        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> list:
        """
        Split text into sentences with NLTK's ``sent_tokenize``.

        Args:
            text: Text to split.

        Returns:
            List of sentence strings.
        """
        from nltk.tokenize import sent_tokenize
        return sent_tokenize(text)

    def correct_spelling(self, text: str) -> str:
        """
        Spell-correct each whitespace-separated token of the text.

        Args:
            text: Text to correct.

        Returns:
            The corrected tokens joined by single spaces. Tokens for which
            the checker offers no correction are kept unchanged.
        """
        corrected_words = []
        for word in text.split():
            suggestion = self.spell_checker.correction(word)
            # correction() may return None when it has no candidate; joining
            # None would raise TypeError, so fall back to the original token.
            corrected_words.append(word if suggestion is None else suggestion)
        return " ".join(corrected_words)

    def preprocess_text(self, text: str) -> str:
        """
        Run the full pipeline: clean, spell-correct, then fix spacing.

        Args:
            text: Raw input text.

        Returns:
            The preprocessed text.
        """
        cleaned = self.clean_text(text)
        corrected = self.correct_spelling(cleaned)
        return self.correct_spacing(corrected)

    def clean_sentences(self, sentences: list) -> list:
        """
        Apply ``clean_text`` (default options) to every sentence.

        Args:
            sentences: Sentences to clean.

        Returns:
            A new list with the cleaned sentences, in the same order.
        """
        return [self.clean_text(sentence) for sentence in sentences]


class PDFProcessor:
    """Convert PDF files to plain text with an OCR model."""

    def __init__(self, max_pages=20):
        """
        Build the OCR predictor and store the page limit.

        Args:
            max_pages: Maximum number of leading pages sent to OCR by
                ``pdf_to_text``.
        """
        self.ocr_model = ocr_predictor(pretrained=True)
        self.max_pages = max_pages

    def pdf_to_text(self, pdf_path):
        """
        Convert a PDF file to text using OCR.

        The file is staged as ``temp/<file name>`` (relative to the current
        working directory), OCR'd, and the staged copy is deleted afterwards.
        If ``pdf_path`` already is that staged file, the input itself is
        what gets deleted.

        Args:
            pdf_path: Path of the PDF to process.

        Returns:
            The text of all OCR blocks joined by newlines. On failure the
            error is printed and a Spanish error message is returned instead
            of raising.

        Raises:
            OSError: If staging the file with ``shutil.copy`` fails; the copy
                happens before the guarded section.
        """
        temp_dir = Path("temp")
        temp_dir.mkdir(exist_ok=True)

        source_path = Path(pdf_path)
        temp_file_path = temp_dir / source_path.name
        # Copy when nothing is staged yet, and also when a same-named file
        # from a *different* document is lying around: otherwise that stale
        # file would be OCR'd instead of the requested one.
        needs_copy = not temp_file_path.exists() or (
            source_path.is_file() and not temp_file_path.samefile(source_path)
        )
        if needs_copy:
            shutil.copy(source_path, temp_file_path)

        try:
            # 1) Load the PDF.
            doc = DocumentFile.from_pdf(temp_file_path)

            # 2) Validate it and cap the number of pages.
            ocr_input = self._limit_pages(doc)

            # 3) Run the OCR model.
            ocr_result = self.ocr_model(ocr_input)

            # 4) Collect the text of every block and join it.
            text_blocks = []
            for page in ocr_result.pages:
                for block in page.blocks:
                    text_blocks.append(self._block_text(block))
            return "\n".join(text_blocks)

        except ValueError as ve:
            print(f"Error al procesar el archivo PDF: {ve}")
            return "El archivo PDF no es válido o está corrupto."

        except Exception as e:
            # Top-level boundary: report and return a message rather than
            # propagate, matching the method's best-effort contract.
            print(f"Error inesperado al procesar el PDF: {e}")
            return "Ocurrió un error inesperado al procesar el archivo."

        finally:
            # Always remove the staged file once processing is over.
            if temp_file_path.exists():
                temp_file_path.unlink()

    def _limit_pages(self, doc):
        """Return ``doc`` capped at ``self.max_pages`` pages; raise ValueError if it has none."""
        if hasattr(doc, "pages"):
            # Document object exposing its pages as an attribute.
            if not doc.pages:
                raise ValueError("El archivo no contiene páginas procesables.")
            if len(doc.pages) > self.max_pages:
                doc.pages = doc.pages[:self.max_pages]
            return doc

        # NOTE(review): the docTR documentation describes from_pdf as
        # returning a plain list of page images (no `.pages` attribute);
        # confirm against the installed version.
        if len(doc) == 0:
            raise ValueError("El archivo no contiene páginas procesables.")
        return doc[:self.max_pages]

    @staticmethod
    def _block_text(block):
        """Return the text of one OCR block, via ``.text`` if present, else ``render()``."""
        text = getattr(block, "text", None)
        if text is None:
            # NOTE(review): docTR blocks are documented to expose render()
            # rather than a `.text` attribute; confirm against the installed
            # version.
            text = block.render()
        return text

    @staticmethod
    def clear_temp_directory():
        """Delete every regular file directly inside ``temp``; subdirectories are kept."""
        temp_dir = Path("temp")
        if temp_dir.exists():
            for file in temp_dir.iterdir():
                if file.is_file():
                    file.unlink()


class FileHandler:
    """Helpers for staging uploaded files and tidying working directories."""

    @staticmethod
    def save_temp_file(file_obj, temp_dir: Path = None) -> str:
        """
        Write the contents of a file-like object into a temporary directory.

        Args:
            file_obj: Object exposing ``name`` and a ``read()`` that returns
                bytes. Only the final component of ``name`` is used.
            temp_dir: Destination directory; ``temp`` (relative to the
                current working directory) when omitted.

        Returns:
            The resolved path of the written file, as a string.
        """
        target_dir = Path("temp") if temp_dir is None else temp_dir
        target_dir.mkdir(exist_ok=True)

        destination = target_dir / Path(file_obj.name).name
        with destination.open("wb") as sink:
            sink.write(file_obj.read())

        return str(destination.resolve())

    @staticmethod
    def clear_temp_files(directory="temp", name_contains="RESULT_"):
        """
        Delete the regular files in a directory whose name contains a marker.

        Args:
            directory: Directory to scan (not recursive). A missing
                directory is silently ignored.
            name_contains: Substring a file name must contain to be deleted.
        """
        folder = Path(directory)
        if folder.exists():
            for entry in folder.iterdir():
                if not entry.is_file():
                    continue
                if name_contains in entry.name:
                    entry.unlink()

    @staticmethod
    def move_to_completed(from_dir: Path, filename: str, completed_dir="completed"):
        """
        Move a processed file into a subfolder of its directory.

        Args:
            from_dir: Directory currently holding the file.
            filename: Name of the file to move.
            completed_dir: Name of the subfolder (created if missing) that
                receives the file.
        """
        destination_dir = from_dir / completed_dir
        destination_dir.mkdir(exist_ok=True)

        source = from_dir / filename
        shutil.move(source, destination_dir / filename)