Spaces:

eberhenriquez94
/

PDF_OCR

Build error

App Files Files Community

eberhenriquez94 committed on Dec 1, 2024

Commit

30cca7c

verified ·

1 Parent(s): 1390186

a

Browse files

Files changed (1) hide show

app.py +94 -28

app.py CHANGED Viewed

@@ -1,31 +1,97 @@
 import gradio as gr
 from PyPDF2 import PdfReader
-# Función para extraer texto de un PDF
-def extract_text_from_pdf(pdf_path):
-    """
-    Extrae el texto de un archivo PDF dado su path.
-    """
-    reader = PdfReader(pdf_path)
-    text = "".join([page.extract_text() for page in reader.pages])
-    return text.strip()
-# Función principal para procesar PDFs
-def process_pdf(file):
-    """
-    Procesa el archivo PDF cargado y extrae el texto.
-    """
-    text = extract_text_from_pdf(file.name)  # Usamos la ruta del archivo directamente
-    return text
-# Configurar la interfaz de Gradio
-with gr.Blocks() as demo:
-    gr.Markdown("# Procesador de PDFs en Hugging Face Space")
-    pdf_file = gr.File(label="Carga tu PDF", file_types=[".pdf"])
-    text_output = gr.Textbox(label="Texto Extraído", lines=10)
-    process_button = gr.Button("Procesar PDF")
-    process_button.click(process_pdf, inputs=[pdf_file], outputs=[text_output])
-# Lanzar la aplicación
-demo.launch()

 import gradio as gr
+import subprocess
+import logging
 from PyPDF2 import PdfReader
+import tempfile
+import os
+# Logging setup: a module-level logger is used instead of print(); the root
+# logger is configured for INFO level with timestamp, level and message.
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+def ejecutar_comando(comando):
+    """Run an external command and return its captured standard output.
+
+    Args:
+        comando: Either a single string, which is executed through the shell
+            (the original behaviour), or a list/tuple of arguments, which is
+            executed directly without a shell so no quoting is required.
+
+    Returns:
+        The command's standard output as text.
+
+    Raises:
+        RuntimeError: If the command exits with a non-zero status or cannot
+            be started. The message contains the command and any captured
+            stderr; the underlying exception is chained as the cause.
+    """
+    # Only plain strings need shell parsing; argument sequences are handed to
+    # the OS as-is, which keeps paths with spaces or metacharacters intact.
+    usar_shell = isinstance(comando, str)
+    try:
+        resultado = subprocess.run(
+            comando, shell=usar_shell, check=True, capture_output=True, text=True
+        )
+    except (subprocess.CalledProcessError, OSError) as e:
+        # OSError (e.g. executable not found when no shell is used) carries
+        # no stderr, so fall back to an empty string for that case.
+        salida_error = getattr(e, "stderr", "")
+        error_message = f"Error al ejecutar el comando: {comando}\nError: {e}\nSalida de error:\n{salida_error}"
+        logger.error(error_message)
+        # Chain the cause so the original traceback is not lost.
+        raise RuntimeError(error_message) from e
+    logger.info(f"Comando ejecutado: {comando}\nSalida:\n{resultado.stdout}")
+    return resultado.stdout
+def crear_pdf_con_texto_incrustado(pdf_original, archivo_salida, idioma="spa"):
+    """Run OCRmyPDF on a PDF and write a copy with an embedded text layer.
+
+    Args:
+        pdf_original: Path of the input PDF.
+        archivo_salida: Path where the OCR-processed PDF is written.
+        idioma: Language code passed to ``ocrmypdf -l`` (default "spa").
+
+    Raises:
+        gr.Error: If the OCR command fails, so the message is surfaced in
+            the Gradio interface.
+    """
+    import shlex  # function-scope import: only needed to quote shell arguments
+
+    # The command string is executed through the shell, so every
+    # caller-supplied value is quoted. This keeps paths containing spaces
+    # working and prevents shell metacharacters from being interpreted.
+    comando = (
+        f"ocrmypdf -l {shlex.quote(str(idioma))} --force-ocr --deskew --clean "
+        f"--output-type pdf {shlex.quote(str(pdf_original))} {shlex.quote(str(archivo_salida))}"
+    )
+    try:
+        ejecutar_comando(comando)
+    except RuntimeError as e:
+        # Convert to gr.Error so the failure is shown to the user.
+        raise gr.Error(str(e)) from e
+def leer_pdf(pdf_path):
+    """Extract the text of every page of a PDF.
+
+    Args:
+        pdf_path: Path (or file object) accepted by ``PdfReader``.
+
+    Returns:
+        The page texts joined by newlines with surrounding whitespace
+        stripped; a fixed "no text" message if nothing was extracted; or an
+        "error" message string if reading the PDF raised an exception.
+        This function never raises.
+    """
+    try:
+        reader = PdfReader(pdf_path)
+        # Defensive: treat a page whose extraction yields None/empty as ""
+        # so a single text-less page cannot abort the whole extraction.
+        texto = "\n".join((pagina.extract_text() or "") for pagina in reader.pages)
+    except Exception as e:
+        # Deliberate best-effort: the caller displays this string in the UI.
+        return f"Error al leer el PDF: {e}"
+    return texto.strip() or "No se pudo extraer texto del PDF."
+def flujo_principal(pdf_file, idioma="spa"):
+    """Run OCR on an uploaded PDF and return both PDFs with their text.
+
+    Args:
+        pdf_file: The uploaded PDF. Accepted as a filesystem path, an object
+            exposing its path as ``.name``, raw bytes, or a readable binary
+            file object.
+        idioma: OCR language code forwarded to
+            ``crear_pdf_con_texto_incrustado`` (default "spa").
+
+    Returns:
+        A 4-tuple ``(original PDF file, original text, OCR PDF file,
+        OCR text)``.
+
+    Raises:
+        gr.Error: If no file was uploaded or the OCR step fails.
+    """
+    if not pdf_file:
+        raise gr.Error("No se subió ningún archivo.")
+    # Work out whether the upload can be read from a path on disk.
+    if isinstance(pdf_file, (str, os.PathLike)):
+        ruta_subida = pdf_file
+    else:
+        ruta_subida = getattr(pdf_file, "name", None)
+        if not isinstance(ruta_subida, (str, os.PathLike)):
+            ruta_subida = None
+    # Reserve both temporary paths up front so cleanup can cover them.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_input:
+        input_pdf = temp_input.name
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_output:
+        output_pdf = temp_output.name
+    try:
+        # Copy the upload into our own temporary input file.
+        with open(input_pdf, "wb") as destino:
+            if isinstance(pdf_file, (bytes, bytearray)):
+                destino.write(pdf_file)
+            elif ruta_subida is not None:
+                with open(ruta_subida, "rb") as origen:
+                    destino.write(origen.read())
+            else:
+                destino.write(pdf_file.read())
+        texto_original = leer_pdf(input_pdf)
+        crear_pdf_con_texto_incrustado(input_pdf, output_pdf, idioma)
+        texto_ocr = leer_pdf(output_pdf)
+    except Exception:
+        # Remove the temporary files only when something failed.
+        for ruta in (input_pdf, output_pdf):
+            if os.path.exists(ruta):
+                os.remove(ruta)
+        raise  # re-raise unchanged so Gradio can report it
+    # On success the files are intentionally kept: the returned components
+    # reference these paths, and they are read only after this function has
+    # returned. Deleting them here (as a ``finally`` would) leaves the
+    # outputs pointing at files that no longer exist.
+    return gr.File(input_pdf, label="PDF Original"), texto_original, gr.File(output_pdf, label="PDF con OCR"), texto_ocr
+# Gradio interface: upload row, text comparison row, download row.
+with gr.Blocks() as interfaz:
+    gr.Markdown("## Procesador OCR para PDFs")
+    # Row 1: PDF upload, OCR language selector (default "spa"), trigger button.
+    with gr.Row():
+        archivo_pdf = gr.File(label="Sube tu archivo PDF")
+        idioma_ocr = gr.Dropdown(["spa", "eng", "fra", "deu"], label="Idioma OCR", value="spa")
+        boton_procesar = gr.Button("Procesar OCR")
+    # Row 2: read-only text extracted before and after OCR.
+    with gr.Row():
+        texto_original = gr.Textbox(label="Texto Original", lines=10, interactive=False)
+        texto_ocr = gr.Textbox(label="Texto con OCR", lines=10, interactive=False)
+    # Row 3: read-only file components for the original and OCR-processed PDFs.
+    with gr.Row():
+        pdf_original_vista = gr.File(label="Descargar PDF Original", interactive=False)
+        pdf_ocr_vista = gr.File(label="Descargar PDF con OCR", interactive=False)
+    # The order of `outputs` must match the 4-tuple returned by
+    # flujo_principal: (original PDF, original text, OCR PDF, OCR text).
+    boton_procesar.click(
+        fn=flujo_principal,
+        inputs=[archivo_pdf, idioma_ocr],
+        outputs=[pdf_original_vista, texto_original, pdf_ocr_vista, texto_ocr]
+    )
+# Start the app at import time.
+# NOTE(review): share=True requests a public share link — confirm it is
+# needed when the app is hosted as a Space.
+interfaz.launch(share=True)