Spaces:

eberhenriquez94
/

PDF_OCR

Build error

App Files Community

eberhenriquez94 committed on Dec 1, 2024

Commit

7182873

verified ·

1 Parent(s): 0db9ac2

a

Browse files

Files changed (1) hide show

app.py +30 -9

app.py CHANGED Viewed

@@ -25,7 +25,7 @@ def crear_pdf_con_texto_incrustado(pdf_original, archivo_salida, idioma="spa"):
     """Procesa un PDF con OCR usando OCRmyPDF."""
     try:
         # Usa shlex.quote para manejar espacios en los nombres de archivo
-        comando = f"ocrmypdf -l {idioma} --force-ocr --deskew --clean --output-type pdf {shlex.quote(pdf_original)} {shlex.quote(archivo_salida)}"
         ejecutar_comando(comando)
     except RuntimeError as e:
         raise gr.Error(str(e))  # Mostrar el error en la interfaz de Gradio
@@ -46,19 +46,40 @@ def flujo_principal(pdf_file, idioma="spa"):
     if not pdf_file:
         raise gr.Error("No se subió ningún archivo.")
-    # Guardar el archivo subido en el directorio /tmp
-    temp_dir = tempfile.mkdtemp()
-    input_pdf_path = os.path.join(temp_dir, pdf_file.name)
-    # Usar pdf_file.name para acceder al contenido del archivo
-    with open(input_pdf_path, "wb") as f:
-        f.write(open(pdf_file.name, "rb").read())
 # Interfaz Gradio
 with gr.Blocks() as interfaz:
     gr.Markdown("## Procesador OCR para PDFs")
     with gr.Row():
-        archivo_pdf = gr.File(label="Sube tu archivo PDF")
         idioma_ocr = gr.Dropdown(["spa", "eng", "fra", "deu"], label="Idioma OCR", value="spa")
         boton_procesar = gr.Button("Procesar OCR")
     with gr.Row():
@@ -73,4 +94,4 @@ with gr.Blocks() as interfaz:
         outputs=[pdf_original_vista, texto_original, pdf_ocr_vista, texto_ocr]
     )
-interfaz.launch()

     """Procesa un PDF con OCR usando OCRmyPDF."""
     try:
         # Usa shlex.quote para manejar espacios en los nombres de archivo
+        comando = f"ocrmypdf -l {idioma} --force-ocr --deskew --output-type pdf {shlex.quote(pdf_original)} {shlex.quote(archivo_salida)}"
         ejecutar_comando(comando)
     except RuntimeError as e:
         raise gr.Error(str(e))  # Mostrar el error en la interfaz de Gradio
     if not pdf_file:
         raise gr.Error("No se subió ningún archivo.")
+    # Crear archivos temporales para el procesamiento
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_input:
+        temp_input.write(pdf_file.read())
+        input_pdf = temp_input.name
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_output:
+        output_pdf = temp_output.name
+    # Extraer texto original del PDF
+    texto_original = leer_pdf(input_pdf)
+    # Procesar el PDF con OCR
+    try:
+        crear_pdf_con_texto_incrustado(input_pdf, output_pdf, idioma)
+        texto_ocr = leer_pdf(output_pdf)
+        return gr.File(input_pdf, label="PDF Original"), texto_original, gr.File(output_pdf, label="PDF con OCR"), texto_ocr
+    except gr.Error as e:
+        if os.path.exists(input_pdf):
+            os.remove(input_pdf)
+        if os.path.exists(output_pdf):
+            os.remove(output_pdf)
+        raise e
+    finally:
+        # Limpieza de archivos temporales
+        if os.path.exists(input_pdf):
+            os.remove(input_pdf)
+        if os.path.exists(output_pdf):
+            os.remove(output_pdf)
 # Interfaz Gradio
 with gr.Blocks() as interfaz:
     gr.Markdown("## Procesador OCR para PDFs")
     with gr.Row():
+        archivo_pdf = gr.File(label="Sube tu archivo PDF", file_types=[".pdf"])
         idioma_ocr = gr.Dropdown(["spa", "eng", "fra", "deu"], label="Idioma OCR", value="spa")
         boton_procesar = gr.Button("Procesar OCR")
     with gr.Row():
         outputs=[pdf_original_vista, texto_original, pdf_ocr_vista, texto_ocr]
     )
+interfaz.launch()