Spaces:

visualpolice
/

OCR-BT-Gradio

Sleeping

App Files Files Community

OCR-BT-Gradio / app.py

Teera

Upload app.py

3c77bf4 verified 7 months ago

raw

history blame contribute delete

3.05 kB

	import subprocess
	import json
	import os
	import gradio as gr
	from PyPDF2 import PdfReader, PdfWriter

	# Function to split a PDF into batch files of `batch_size` pages each
	def split_pdf(file_path, batch_size=3):
	    """Split the PDF at *file_path* into consecutive smaller PDF files.

	    Each batch is written to the current working directory as
	    ``./temp_batch_<index>.pdf``; an existing file with the same name is
	    overwritten.

	    Args:
	        file_path: Path of the PDF to split.
	        batch_size: Maximum number of pages per batch file (at least 1).

	    Returns:
	        The list of batch file paths, in page order. Empty when the PDF
	        has no pages.

	    Raises:
	        ValueError: If *batch_size* is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError(
	            f"batch_size must be a positive integer, got {batch_size!r}"
	        )

	    pdf_batches = []

	    # Keep the source open until every batch is written (the reader may
	    # fetch page data lazily from the stream) and close it afterwards
	    # instead of leaking the handle.
	    with open(file_path, "rb") as source_file:
	        pdf_reader = PdfReader(source_file)
	        total_pages = len(pdf_reader.pages)

	        for i in range(0, total_pages, batch_size):
	            pdf_writer = PdfWriter()
	            for j in range(i, min(i + batch_size, total_pages)):
	                pdf_writer.add_page(pdf_reader.pages[j])

	            batch_path = f"./temp_batch_{i // batch_size}.pdf"
	            with open(batch_path, "wb") as batch_file:
	                pdf_writer.write(batch_file)

	            pdf_batches.append(batch_path)

	    return pdf_batches

	# Function to process the PDF batch using subprocess
	def process_pdf_batch(batch_path, output_dir):
	    """Run ``surya_ocr`` on one batch file and return the recognised text.

	    The results are expected at ``<output_dir>/<name>/results.json``,
	    where ``<name>`` is the batch file's base name up to its first dot.

	    Args:
	        batch_path: Path of the PDF batch to OCR.
	        output_dir: Directory passed to ``surya_ocr`` as ``--results_dir``.

	    Returns:
	        The text of every line on every page, each followed by a newline.
	        If the command fails, cannot be started, or produces no result
	        file, a human-readable message is returned instead.
	    """
	    # The same name is the results sub-directory and the key in the JSON
	    pdf_name = os.path.basename(batch_path).split('.')[0]
	    result_path = os.path.join(output_dir, pdf_name, "results.json")

	    # Argument list (no shell), so the path is never shell-interpreted
	    ocr_command = ["surya_ocr", batch_path, "--results_dir", output_dir]

	    try:
	        result = subprocess.run(
	            ocr_command,
	            check=True,
	            text=True,
	            capture_output=True,
	            encoding="utf-8",
	        )
	    except subprocess.CalledProcessError as e:
	        return f"OCR processing failed: {e.stderr}"
	    except OSError as e:
	        # Executable missing or not runnable: report it like any other
	        # OCR failure instead of crashing the request.
	        return f"OCR processing failed: {e}"
	    print("OCR Command Output:", result.stdout)

	    if not os.path.exists(result_path):
	        return "OCR processing completed, but result file not found."

	    with open(result_path, 'r', encoding="utf-8") as f:
	        data = json.load(f)

	    # Gather every line first and join once, rather than growing a
	    # string with += inside nested loops.
	    text_lines = []
	    for page_data in data[pdf_name]:
	        text_lines.extend(line['text'] for line in page_data['text_lines'])

	    return "".join(text + '\n' for text in text_lines)

	# Main function to process the entire PDF in batches
	def process_pdf(file):
	    """OCR an uploaded PDF in 3-page batches and return the combined text.

	    Args:
	        file: The upload: either an object whose ``name`` attribute holds
	            the file path, or the path itself. ``None`` means nothing
	            was uploaded.

	    Returns:
	        The result of every batch, each followed by a newline, in page
	        order; or a short message when no file was supplied.
	    """
	    if file is None:
	        return "No file uploaded."

	    # Define output directory
	    output_dir = "./result"

	    # Use .name when the upload object has one, else treat it as the path
	    file_path = getattr(file, "name", file)

	    # Split the uploaded PDF into batches of 3 pages
	    pdf_batches = split_pdf(file_path, batch_size=3)

	    batch_results = []
	    try:
	        for batch_path in pdf_batches:
	            batch_results.append(process_pdf_batch(batch_path, output_dir))
	    finally:
	        # The batch files are only intermediates; remove them even when a
	        # batch fails so they do not pile up in the working directory.
	        for batch_path in pdf_batches:
	            try:
	                os.remove(batch_path)
	            except OSError:
	                # Best-effort cleanup: a leftover temp file must not mask
	                # the OCR result or the original error.
	                pass

	    return "".join(batch_result + "\n" for batch_result in batch_results)

	# Callback wired into the Gradio interface
	def process_pdf_gradio(file):
	    """Pass the uploaded *file* to ``process_pdf`` and return its text."""
	    return process_pdf(file)

	# Gradio app: one file upload in, one text box with the OCR result out
	app = gr.Interface(
	    title="PDF OCR Extractor",
	    fn=process_pdf_gradio,
	    inputs=gr.File(label="Upload PDF"),
	    outputs=gr.Textbox(label="Extracted Text"),
	)

	# Launch on all interfaces at a fixed port so the app is reachable from
	# outside a Docker container.
	# NOTE(review): share=True also requests a public share link; confirm
	# that is intended for this deployment.
	app.launch(
	    server_name="0.0.0.0",
	    server_port=7860,
	    share=True,
	)