Spaces:

eberhenriquez94
/

PDF_OCR

Build error

File size: 3,871 Bytes

1b10823
30cca7c
 
1b10823
30cca7c
 
af608ef
1b10823
d473fc1
30cca7c
d473fc1
30cca7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7182873
d473fc1
 
b5e6996
30cca7c
 
 
 
 
 
 
 
 
 
b5e6996
30cca7c
 
 
 
 
 
 
7182873
 
 
 
30cca7c
7182873
 
 
 
 
 
b5e6996
7182873
 
 
 
b5e6996
7182873
 
b5e6996
7182873
 
 
 
30cca7c
 
 
 
b5e6996
 
30cca7c
7182873
30cca7c
 
b5e6996
 
30cca7c
 
 
b5e6996
30cca7c
 
 
b5e6996
 
30cca7c
 
 
 
 
 
b5e6996

import gradio as gr
import subprocess
import logging
from PyPDF2 import PdfReader
import tempfile
import os
import shlex

# Logging setup: INFO-level records formatted as "time - level - message",
# plus the logger used by the rest of this module.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def ejecutar_comando(comando):
    """Run an external command and return its standard output.

    Args:
        comando: Either a command string, which is handed to the shell
            (the original behaviour), or a sequence of arguments, which is
            executed directly without a shell so no quoting is required.

    Returns:
        The text the command wrote to stdout.

    Raises:
        RuntimeError: If the command exits with a non-zero status or cannot
            be started at all. The underlying exception is chained as
            ``__cause__`` so the original traceback is preserved.
    """
    # Only plain strings go through the shell; an argument list is run as-is,
    # which lets callers avoid shell interpretation of their arguments.
    usar_shell = isinstance(comando, str)
    try:
        resultado = subprocess.run(
            comando, shell=usar_shell, check=True, capture_output=True, text=True
        )
    except subprocess.CalledProcessError as e:
        error_message = f"Error al ejecutar el comando: {comando}\nError: {e}\nSalida de error:\n{e.stderr}"
        logger.error(error_message)
        raise RuntimeError(error_message) from e
    except OSError as e:
        # Raised when the program itself cannot be launched (e.g. missing
        # executable); there is no stderr to report in that case.
        error_message = f"Error al ejecutar el comando: {comando}\nError: {e}"
        logger.error(error_message)
        raise RuntimeError(error_message) from e

    logger.info(f"Comando ejecutado: {comando}\nSalida:\n{resultado.stdout}")
    return resultado.stdout

def crear_pdf_con_texto_incrustado(pdf_original, archivo_salida, idioma="spa"):
    """Run OCRmyPDF on a PDF and write a copy with an embedded text layer.

    Args:
        pdf_original: Path of the PDF to process.
        archivo_salida: Path where the OCR-processed PDF is written.
        idioma: Language code passed to ``ocrmypdf -l`` (default ``"spa"``).

    Raises:
        gr.Error: If the OCR command fails; the underlying ``RuntimeError``
            is chained as ``__cause__``.
    """
    # Every interpolated value is shell-quoted, including the language code:
    # the command is a shell string, so an unquoted value could inject
    # additional shell syntax.
    comando = (
        f"ocrmypdf -l {shlex.quote(str(idioma))} --force-ocr --deskew --output-type pdf "
        f"{shlex.quote(pdf_original)} {shlex.quote(archivo_salida)}"
    )
    try:
        ejecutar_comando(comando)
    except RuntimeError as e:
        raise gr.Error(f"Error al procesar el archivo con OCR: {e}") from e

def leer_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    The text of each page is followed by a newline and the combined result
    is stripped of surrounding whitespace. If nothing remains, a fallback
    message is returned instead. Any exception raised while reading is
    logged and reported as an error-message string rather than propagated.
    """
    try:
        paginas = PdfReader(pdf_path).pages
        contenido = "".join(pagina.extract_text() + "\n" for pagina in paginas)
    except Exception as e:
        mensaje = f"Error al leer el PDF: {e}"
        logger.error(mensaje)
        return mensaje

    limpio = contenido.strip()
    if limpio:
        return limpio
    return "No se pudo extraer texto del PDF."

def _contenido_pdf_subido(pdf_file):
    """Return the bytes of an uploaded PDF, however it was delivered.

    Accepts raw bytes, a filesystem path, or a file-like object. For a
    file-like object whose ``name`` is an existing file, the file on disk is
    read instead of the handle.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    if isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as origen:
            return origen.read()
    # NOTE(review): some Gradio versions appear to pass a wrapper whose
    # ``name`` points at the uploaded file while the handle itself may not be
    # positioned on the data -- confirm against the deployed Gradio version.
    ruta = getattr(pdf_file, "name", None)
    if isinstance(ruta, str) and os.path.isfile(ruta):
        with open(ruta, "rb") as origen:
            return origen.read()
    # No usable path: read the object directly, as the original code did.
    return pdf_file.read()


def flujo_principal(pdf_file, idioma="spa"):
    """Process an uploaded PDF: extract its text, run OCR, extract again.

    Args:
        pdf_file: The uploaded PDF as delivered by the file input (raw bytes,
            a path, or a file-like object).
        idioma: OCR language code forwarded to the OCR step (default "spa").

    Returns:
        A 4-tuple ``(original file, original text, OCR file, OCR text)``
        where both files are ``gr.File`` components pointing at temporary
        PDFs and both texts are strings produced by ``leer_pdf``.

    Raises:
        gr.Error: If no file was provided or the OCR step fails.
    """
    if not pdf_file:
        raise gr.Error("No se subió ningún archivo.")

    # Read the upload before creating any temp file, so a failed read leaves
    # nothing behind on disk.
    contenido = _contenido_pdf_subido(pdf_file)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_input:
        temp_input.write(contenido)
        input_pdf = temp_input.name

    # Only the name is needed here; the OCR step writes the actual content.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_output:
        output_pdf = temp_output.name

    try:
        texto_original = leer_pdf(input_pdf)
        crear_pdf_con_texto_incrustado(input_pdf, output_pdf, idioma)
        texto_ocr = leer_pdf(output_pdf)
    except Exception:
        logger.exception("Error durante el procesamiento del PDF.")
        # Nothing is returned on failure, so the temp files can go.
        for ruta in (input_pdf, output_pdf):
            try:
                os.remove(ruta)
            except FileNotFoundError:
                pass
        raise

    # The temp files are deliberately NOT removed on success: the returned
    # components reference these paths, and a ``finally`` cleanup would run
    # before the caller ever receives them, leaving it with missing files.
    return (
        gr.File(input_pdf, label="PDF Original"),
        texto_original,
        gr.File(output_pdf, label="PDF con OCR"),
        texto_ocr,
    )

# Gradio interface: upload a PDF, choose the OCR language, then show the text
# extracted before and after OCR together with both PDFs for download.
with gr.Blocks() as interfaz:
    gr.Markdown("## Procesador OCR para PDFs")
    
    # File upload, OCR language selection and the button that starts processing
    with gr.Row():
        archivo_pdf = gr.File(label="Sube tu archivo PDF", file_types=[".pdf"])
        idioma_ocr = gr.Dropdown(["spa", "eng", "fra", "deu"], label="Idioma OCR", value="spa")
        boton_procesar = gr.Button("Procesar OCR")
    
    # Processing results: read-only text boxes for the original and OCR text
    with gr.Row():
        texto_original = gr.Textbox(label="Texto Original", lines=10, interactive=False)
        texto_ocr = gr.Textbox(label="Texto con OCR", lines=10, interactive=False)
    
    # Read-only file components for the original and OCR-processed PDFs
    with gr.Row():
        pdf_original_vista = gr.File(label="Descargar PDF Original", interactive=False)
        pdf_ocr_vista = gr.File(label="Descargar PDF con OCR", interactive=False)
    
    # Wire the button to flujo_principal; `outputs` is listed in the same
    # order as the 4-tuple that function returns.
    boton_procesar.click(
        fn=flujo_principal,
        inputs=[archivo_pdf, idioma_ocr],
        outputs=[pdf_original_vista, texto_original, pdf_ocr_vista, texto_ocr]
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    interfaz.launch()