Teera committed on
Commit
3c77bf4
·
verified ·
1 Parent(s): bdab50e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -91
app.py CHANGED
@@ -1,91 +1,89 @@
1
- import subprocess
2
- import json
3
- import os
4
- import gradio as gr
5
- from PyPDF2 import PdfReader, PdfWriter
6
-
7
- !pip install pandas
8
-
9
- # Function to split PDF into batches of 3 pages
10
- def split_pdf(file_path, batch_size=3):
11
- pdf_reader = PdfReader(open(file_path, "rb"))
12
- total_pages = len(pdf_reader.pages)
13
- pdf_batches = []
14
-
15
- # Split the PDF into batches of 3 pages
16
- for i in range(0, total_pages, batch_size):
17
- pdf_writer = PdfWriter()
18
- for j in range(i, min(i + batch_size, total_pages)):
19
- pdf_writer.add_page(pdf_reader.pages[j])
20
-
21
- batch_path = f"./temp_batch_{i // batch_size}.pdf"
22
- with open(batch_path, "wb") as batch_file:
23
- pdf_writer.write(batch_file)
24
-
25
- pdf_batches.append(batch_path)
26
-
27
- return pdf_batches
28
-
29
- # Function to process the PDF batch using subprocess
30
- def process_pdf_batch(batch_path, output_dir):
31
- # Extract the base name of the batch file
32
- pdf_name = os.path.basename(batch_path).split('.')[0]
33
- result_path = os.path.join(output_dir, pdf_name, "results.json")
34
-
35
- # Build the OCR command
36
- ocr_command = ["surya_ocr", batch_path, "--results_dir", output_dir]
37
-
38
- # Run the command using subprocess
39
- try:
40
- result = subprocess.run(ocr_command, check=True, text=True, capture_output=True,encoding="utf-8")
41
- print("OCR Command Output:", result.stdout)
42
- except subprocess.CalledProcessError as e:
43
- return f"OCR processing failed: {e.stderr}"
44
-
45
- # After OCR processing, read the results from the JSON file
46
- if os.path.exists(result_path):
47
- with open(result_path, 'r', encoding="utf-8") as f:
48
- data = json.load(f)
49
-
50
- # Extract text from the JSON
51
- result_text = ''
52
- for page_data in data[pdf_name]:
53
- for line in page_data['text_lines']:
54
- result_text += line['text'] + '\n'
55
-
56
- return result_text
57
- else:
58
- return "OCR processing completed, but result file not found."
59
-
60
- # Main function to process the entire PDF in batches
61
- def process_pdf(file):
62
- # Define output directory
63
- output_dir = "./result"
64
-
65
- # Split the uploaded PDF into batches of 3 pages
66
- pdf_batches = split_pdf(file.name, batch_size=3)
67
-
68
- # Process each batch and accumulate results
69
- final_text = ""
70
- for batch_path in pdf_batches:
71
- batch_result = process_pdf_batch(batch_path, output_dir)
72
- final_text += batch_result + "\n"
73
-
74
- return final_text
75
-
76
- # Define Gradio interface
77
- def process_pdf_gradio(file):
78
- # Gradio handles the file upload differently, so process accordingly
79
- result = process_pdf(file)
80
- return result
81
-
82
- # Gradio app
83
- app = gr.Interface(
84
- fn=process_pdf_gradio,
85
- inputs=gr.File(label="Upload PDF"),
86
- outputs=gr.Textbox(label="Extracted Text"),
87
- title="PDF OCR Extractor"
88
- )
89
-
90
- # Launch the app with a specified port for Docker
91
- app.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
1
+ import subprocess
2
+ import json
3
+ import os
4
+ import gradio as gr
5
+ from PyPDF2 import PdfReader, PdfWriter
6
+
7
+ # Function to split PDF into batches of 3 pages
8
def split_pdf(file_path, batch_size=3):
    """Split a PDF into consecutive batches of at most `batch_size` pages.

    Args:
        file_path: Path to the source PDF on disk.
        batch_size: Maximum number of pages per batch (default 3).

    Returns:
        List of paths to the temporary per-batch PDF files, written as
        ./temp_batch_<n>.pdf in the working directory.
    """
    pdf_batches = []

    # Keep the source file open for the whole read/write loop and close it
    # on exit — the original `PdfReader(open(...))` leaked the file handle.
    with open(file_path, "rb") as src:
        pdf_reader = PdfReader(src)
        total_pages = len(pdf_reader.pages)

        # Split the PDF into batches of `batch_size` pages.
        for i in range(0, total_pages, batch_size):
            pdf_writer = PdfWriter()
            for j in range(i, min(i + batch_size, total_pages)):
                pdf_writer.add_page(pdf_reader.pages[j])

            batch_path = f"./temp_batch_{i // batch_size}.pdf"
            with open(batch_path, "wb") as batch_file:
                pdf_writer.write(batch_file)

            pdf_batches.append(batch_path)

    return pdf_batches
26
+
27
+ # Function to process the PDF batch using subprocess
28
def process_pdf_batch(batch_path, output_dir):
    """Run the `surya_ocr` CLI on one batch PDF and return its text.

    Args:
        batch_path: Path to a batch PDF (as produced by split_pdf).
        output_dir: Directory surya_ocr writes its results into.

    Returns:
        The extracted text on success, or a human-readable error message
        string if OCR fails, the executable is missing, or the result
        file is not found.
    """
    # surya_ocr writes <output_dir>/<pdf stem>/results.json. Use splitext
    # so a dotted file name keeps its full stem (the previous
    # `split('.')[0]` truncated at the first dot).
    pdf_name = os.path.splitext(os.path.basename(batch_path))[0]
    result_path = os.path.join(output_dir, pdf_name, "results.json")

    # Build the OCR command as an argument list (no shell involved).
    ocr_command = ["surya_ocr", batch_path, "--results_dir", output_dir]

    try:
        result = subprocess.run(
            ocr_command,
            check=True,
            text=True,
            capture_output=True,
            encoding="utf-8",
        )
        print("OCR Command Output:", result.stdout)
    except subprocess.CalledProcessError as e:
        return f"OCR processing failed: {e.stderr}"
    except FileNotFoundError:
        # subprocess.run raises FileNotFoundError (not CalledProcessError)
        # when the executable itself is not on PATH — previously uncaught.
        return "OCR processing failed: surya_ocr executable not found."

    # After OCR processing, read the results from the JSON file.
    if os.path.exists(result_path):
        with open(result_path, 'r', encoding="utf-8") as f:
            data = json.load(f)

        # Flatten every recognized text line across all pages.
        result_text = ''
        for page_data in data[pdf_name]:
            for line in page_data['text_lines']:
                result_text += line['text'] + '\n'

        return result_text
    else:
        return "OCR processing completed, but result file not found."
57
+
58
+ # Main function to process the entire PDF in batches
59
def process_pdf(file):
    """OCR an uploaded PDF end-to-end and return the concatenated text.

    Splits the upload into small batches, OCRs each batch, and stitches
    the per-batch results together, one newline after each batch.
    """
    # All per-batch OCR output lands under this directory.
    output_dir = "./result"

    # Break the upload into 3-page chunks so each OCR run stays small.
    batch_paths = split_pdf(file.name, batch_size=3)

    # OCR every chunk and join the pieces, appending "\n" after each one
    # (identical to accumulating `result + "\n"` in a loop).
    pieces = [process_pdf_batch(path, output_dir) for path in batch_paths]
    return "".join(piece + "\n" for piece in pieces)
73
+
74
+ # Define Gradio interface
75
def process_pdf_gradio(file):
    """Gradio entry point: delegate the uploaded file to process_pdf."""
    # Gradio hands us a file-like upload object; process_pdf knows how
    # to read its .name attribute, so simply pass it through.
    return process_pdf(file)
79
+
80
+ # Gradio app
81
# Gradio app: single file-upload input mapped to a plain-text output box.
app = gr.Interface(
    fn=process_pdf_gradio,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="PDF OCR Extractor"
)

# Launch the app with a specified port for Docker.
# Binding 0.0.0.0 exposes the server on all interfaces (needed inside a
# container). NOTE(review): share=True also opens a public Gradio tunnel —
# confirm that is intended for this deployment.
app.launch(server_name="0.0.0.0", server_port=7860, share=True)