# OCR-BT-Gradio / app.py
# Teera's picture
# Upload app.py
# 3c77bf4 verified
import subprocess
import json
import os
import gradio as gr
from PyPDF2 import PdfReader, PdfWriter
# Function to split a PDF into fixed-size page batches
def split_pdf(file_path, batch_size=3):
    """Split a PDF into consecutive batch files of at most *batch_size* pages.

    Each batch is written to ``./temp_batch_<index>.pdf`` (index starting at
    0) relative to the current working directory. The batch files are not
    removed here; cleaning them up is left to the caller.

    NOTE(review): the batch file names are fixed, so two calls running at the
    same time in the same working directory would overwrite each other's
    files.

    Args:
        file_path: Path of the PDF file to split.
        batch_size: Maximum number of pages per batch; must be at least 1.

    Returns:
        The paths of the written batch files, in page order. The list is
        empty when the PDF has no pages.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    pdf_batches = []
    # The source stays open for the whole loop (the reader may still need the
    # stream while pages are copied) and is closed deterministically after.
    with open(file_path, "rb") as source_file:
        pdf_reader = PdfReader(source_file)
        total_pages = len(pdf_reader.pages)

        for batch_index, first_page in enumerate(
            range(0, total_pages, batch_size)
        ):
            last_page = min(first_page + batch_size, total_pages)
            pdf_writer = PdfWriter()
            for page_number in range(first_page, last_page):
                pdf_writer.add_page(pdf_reader.pages[page_number])

            batch_path = f"./temp_batch_{batch_index}.pdf"
            with open(batch_path, "wb") as batch_file:
                pdf_writer.write(batch_file)
            pdf_batches.append(batch_path)

    return pdf_batches
# Function to process one PDF batch with the OCR command-line tool
def process_pdf_batch(batch_path, output_dir, *, ocr_executable="surya_ocr"):
    """Run the OCR tool on one PDF batch and return the recognized text.

    The tool is invoked as
    ``<ocr_executable> <batch_path> --results_dir <output_dir>`` and is
    expected to write ``<output_dir>/<name>/results.json``, where ``<name>``
    is the batch file name up to its first dot. That JSON is read as a
    mapping from ``<name>`` to a list of pages, each page holding a
    ``text_lines`` list whose items carry a ``text`` string.

    Failures are reported through the returned string rather than raised, so
    the caller can show them next to the text of the other batches.

    Args:
        batch_path: Path of the PDF batch to run OCR on.
        output_dir: Directory passed to the tool as ``--results_dir``.
        ocr_executable: Name or path of the OCR command to run.

    Returns:
        The recognized text, one line per recognized text line (each followed
        by a newline); or a message starting with ``"OCR processing failed"``
        when the command cannot be started or exits with an error; or a
        message saying the result file was not found.
    """
    # Name up to the first dot: used for the results folder and as JSON key.
    pdf_name = os.path.basename(batch_path).split('.')[0]
    result_path = os.path.join(output_dir, pdf_name, "results.json")

    # Argument list (no shell), so paths with spaces are passed through intact.
    ocr_command = [ocr_executable, batch_path, "--results_dir", output_dir]

    try:
        result = subprocess.run(
            ocr_command,
            check=True,
            text=True,
            capture_output=True,
            encoding="utf-8",
        )
    except subprocess.CalledProcessError as e:
        return f"OCR processing failed: {e.stderr}"
    except OSError as e:
        # The executable is missing or cannot be started; report it the same
        # way as a failing run instead of crashing the caller.
        return f"OCR processing failed: {e}"
    print("OCR Command Output:", result.stdout)

    if not os.path.exists(result_path):
        return "OCR processing completed, but result file not found."

    with open(result_path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    # One join instead of repeated string concatenation.
    return "".join(
        line['text'] + '\n'
        for page_data in data[pdf_name]
        for line in page_data['text_lines']
    )
# Main function to process the entire PDF in batches
def process_pdf(file):
    """Run OCR over an uploaded PDF, three pages at a time, and return the text.

    The PDF is split with ``split_pdf`` into batches of 3 pages, each batch is
    passed to ``process_pdf_batch`` with ``./result`` as the results
    directory, and the per-batch strings are concatenated in order, each
    followed by a newline. The temporary batch PDFs are deleted afterwards,
    also when processing raises.

    Args:
        file: The uploaded file: either an object exposing the file path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.

    Returns:
        The concatenated per-batch output, or a short message when *file* is
        ``None``.
    """
    if file is None:
        return "No PDF file was uploaded."

    # Use `.name` when present (what the original code relied on); otherwise
    # treat the value itself as the path.
    file_path = getattr(file, "name", file)

    output_dir = "./result"
    pdf_batches = split_pdf(file_path, batch_size=3)

    try:
        batch_texts = [
            process_pdf_batch(batch_path, output_dir)
            for batch_path in pdf_batches
        ]
    finally:
        # Best-effort cleanup so batch files do not pile up between requests.
        for batch_path in pdf_batches:
            try:
                os.remove(batch_path)
            except OSError:
                pass

    return "".join(batch_text + "\n" for batch_text in batch_texts)
# Callback wired into the Gradio interface
def process_pdf_gradio(file):
    """Forward the uploaded file to ``process_pdf`` and return its result."""
    return process_pdf(file)
# Gradio app: one file-upload input wired to one text output
pdf_upload = gr.File(label="Upload PDF")
extracted_text_box = gr.Textbox(label="Extracted Text")
app = gr.Interface(
    fn=process_pdf_gradio,
    inputs=pdf_upload,
    outputs=extracted_text_box,
    title="PDF OCR Extractor",
)

# Launch on all interfaces at the fixed port used for Docker; share=True is
# passed through to Gradio unchanged.
app.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,
)