# Spaces: Sleeping
import subprocess | |
import json | |
import os | |
import gradio as gr | |
from PyPDF2 import PdfReader, PdfWriter | |
# Function to split a PDF into batches of pages (3 per batch by default)
def split_pdf(file_path, batch_size=3):
    """Split a PDF into consecutive batch files of at most ``batch_size`` pages.

    Each batch is written to ``./temp_batch_<n>.pdf`` (relative to the current
    working directory), where ``<n>`` is the zero-based batch index. The file
    names depend only on the batch index, so a later call overwrites the
    files produced by an earlier one.

    Args:
        file_path: Path of the PDF file to split.
        batch_size: Maximum number of pages per batch; must be at least 1.

    Returns:
        A list with the paths of the batch files, in page order. The list is
        empty when the PDF has no pages.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    pdf_batches = []
    # Keep the source file open for the whole loop and close it reliably
    # afterwards, even if splitting fails part-way.
    # NOTE(review): PyPDF2 may read page data lazily from the stream, so the
    # handle must outlive every pdf_writer.write() call — confirm.
    with open(file_path, "rb") as source_file:
        pdf_reader = PdfReader(source_file)
        total_pages = len(pdf_reader.pages)

        for start in range(0, total_pages, batch_size):
            pdf_writer = PdfWriter()
            # The last batch may hold fewer than batch_size pages.
            for page_index in range(start, min(start + batch_size, total_pages)):
                pdf_writer.add_page(pdf_reader.pages[page_index])

            batch_path = f"./temp_batch_{start // batch_size}.pdf"
            with open(batch_path, "wb") as batch_file:
                pdf_writer.write(batch_file)
            pdf_batches.append(batch_path)

    return pdf_batches
# Function to process the PDF batch using subprocess
def process_pdf_batch(batch_path, output_dir):
    """Run the ``surya_ocr`` command on one batch PDF and return its text.

    Args:
        batch_path: Path of the batch PDF to run OCR on.
        output_dir: Directory passed to ``surya_ocr`` as ``--results_dir``.

    Returns:
        The recognised text, one OCR text line per output line (each line is
        terminated by a newline). When OCR cannot be completed, a
        human-readable message is returned instead:

        * ``"OCR processing failed: ..."`` if the command is missing or exits
          with a non-zero status;
        * ``"OCR processing completed, but result file not found."`` if the
          expected ``results.json`` does not exist afterwards.
    """
    # Results are looked up under the file name up to its first dot.
    # NOTE(review): presumably this mirrors how surya_ocr names its output
    # folder and the top-level JSON key — confirm before changing it.
    pdf_name = os.path.basename(batch_path).split('.')[0]
    result_path = os.path.join(output_dir, pdf_name, "results.json")

    # Argument list (no shell), so the paths are never shell-interpreted.
    ocr_command = ["surya_ocr", batch_path, "--results_dir", output_dir]

    try:
        result = subprocess.run(
            ocr_command,
            check=True,
            text=True,
            capture_output=True,
            encoding="utf-8",
        )
    except FileNotFoundError:
        # The executable itself is missing; report it the same way as any
        # other OCR failure instead of crashing the caller.
        return "OCR processing failed: surya_ocr executable not found."
    except subprocess.CalledProcessError as e:
        return f"OCR processing failed: {e.stderr}"
    print("OCR Command Output:", result.stdout)

    # After OCR processing, read the results from the JSON file.
    if not os.path.exists(result_path):
        return "OCR processing completed, but result file not found."

    with open(result_path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    # Flatten pages -> text lines into newline-terminated text.
    return "".join(
        line['text'] + '\n'
        for page_data in data[pdf_name]
        for line in page_data['text_lines']
    )
# Main function to process the entire PDF in batches
def process_pdf(file):
    """Run OCR on an uploaded PDF in 3-page batches and return the combined text.

    Args:
        file: The uploaded file, given either as a path string or as an
            object whose ``name`` attribute holds the path. ``None`` means
            that nothing was uploaded.

    Returns:
        The result of every batch in page order, each followed by a newline,
        or ``"No file uploaded."`` when ``file`` is ``None``.
    """
    if file is None:
        return "No file uploaded."

    # Accept a plain path as well as an object exposing the path as `.name`.
    file_path = file if isinstance(file, str) else file.name

    # Define output directory
    output_dir = "./result"

    # Split the uploaded PDF into batches of 3 pages
    pdf_batches = split_pdf(file_path, batch_size=3)

    try:
        batch_results = [
            process_pdf_batch(batch_path, output_dir) for batch_path in pdf_batches
        ]
    finally:
        # The batch PDFs are only intermediates for the OCR step; remove them
        # so they do not accumulate on disk.
        for batch_path in pdf_batches:
            try:
                os.remove(batch_path)
            except OSError:
                # Best-effort cleanup: a file that is already gone or cannot
                # be removed must not mask the OCR result or error.
                pass

    return "".join(batch_result + "\n" for batch_result in batch_results)
# Callback used by the Gradio interface
def process_pdf_gradio(file):
    """Forward the uploaded file to ``process_pdf`` and return what it returns."""
    return process_pdf(file)
# Gradio app: one file upload in, one text box with the extracted text out.
app = gr.Interface(
    fn=process_pdf_gradio,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="PDF OCR Extractor",
)

# Launch only when executed as a script, so that importing this module
# (e.g. from a test or another tool) does not start a web server.
if __name__ == "__main__":
    # Launch the app with a specified host and port for Docker.
    app.launch(server_name="0.0.0.0", server_port=7860, share=True)