Spaces:

DeepDiveDev
/

TransformoDocs-Demo

Sleeping

App Files Files Community

TransformoDocs-Demo / app.py

DeepDiveDev

Update app.py

13a649f verified 4 months ago

raw

history blame

3.74 kB

	import gradio as gr
	import pytesseract
	from PIL import Image
	import pdf2image
	import tempfile
	import os
	import cv2
	import numpy as np

	# If the Tesseract executable is not on PATH, point pytesseract at it explicitly (Windows example):
	# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # For Windows
	# On Linux/macOS, make sure Tesseract is installed and available on PATH.

	def preprocess_image(img):
	    """Binarize an image with Otsu thresholding before it is handed to OCR.

	    Args:
	        img: A PIL image (any mode) or an array-like accepted by ``np.array``.

	    Returns:
	        A PIL image built from the single-channel thresholded array.
	    """
	    # np.array() on a palette ("P") image yields palette indices, on a 1-bit
	    # ("1") image a boolean array, and on an "LA" image a 2-channel array.
	    # None of those are meaningful input for the OpenCV calls below, so every
	    # PIL mode other than plain grayscale/RGB is normalised to RGB first.
	    if isinstance(img, Image.Image) and img.mode not in ("L", "RGB"):
	        img = img.convert("RGB")

	    img_array = np.array(img)

	    # A 2-D array is already grayscale; otherwise collapse the colour channels.
	    if img_array.ndim == 2:
	        gray = img_array
	    else:
	        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

	    # THRESH_OTSU chooses the threshold itself, so the explicit 0 is a placeholder.
	    # The result is inverted here and flipped back at the end of the function.
	    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

	    # Morphological close followed by open, intended as noise removal.
	    # NOTE(review): with a 1x1 structuring element these two operations should
	    # leave the image unchanged; a larger kernel is needed for real denoising.
	    kernel = np.ones((1, 1), np.uint8)
	    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
	    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

	    # Undo the inversion applied by THRESH_BINARY_INV.
	    binary = 255 - binary

	    return Image.fromarray(binary)

	def extract_text_from_image(img):
	    """Run Tesseract OCR on a single image and return the stripped text.

	    The image is first passed through ``preprocess_image``. The Tesseract
	    options select OCR engine mode 3, page segmentation mode 6, the English
	    language model, and a character whitelist.
	    """
	    tesseract_options = r'--oem 3 --psm 6 -l eng -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?@#$%^&*()-+=_:;\'\" "'

	    # Binarize first, then hand the cleaned-up image to Tesseract.
	    raw_text = pytesseract.image_to_string(
	        preprocess_image(img),
	        config=tesseract_options,
	    )

	    # Drop leading/trailing whitespace from the OCR output.
	    return raw_text.strip()

	def extract_text_from_pdf(pdf_path):
	    """Extract text from every page of a PDF file.

	    Each page is rendered to an image with pdf2image and passed through
	    ``extract_text_from_image``.

	    Args:
	        pdf_path: Filesystem path of the PDF to read.

	    Returns:
	        The per-page texts joined by a "--- Page Break ---" separator line
	        (an empty string when the PDF yields no pages).
	    """
	    page_texts = []

	    # The rendered pages are written into a temporary directory, so all OCR
	    # work has to finish before that directory is removed.
	    with tempfile.TemporaryDirectory() as path:
	        images = pdf2image.convert_from_path(pdf_path, output_folder=path)
	        try:
	            for img in images:
	                page_texts.append(extract_text_from_image(img))
	        finally:
	            # Release the page images (and any files backing them) before the
	            # temporary directory is cleaned up, even if OCR failed part-way.
	            for img in images:
	                img.close()

	    return "\n\n--- Page Break ---\n\n".join(page_texts)

	def process_file(file):
	    """Dispatch an uploaded file to the PDF or image OCR path.

	    Args:
	        file: ``None``, a filesystem path (``str`` or ``os.PathLike``), or an
	            object whose ``name`` attribute holds the path of the upload.

	    Returns:
	        The extracted text, or a user-facing message when nothing was
	        uploaded or the file extension is not supported.
	    """
	    if file is None:
	        return "No file uploaded. Please upload an image or PDF file."

	    # Accept a plain path as well as a file-like wrapper exposing ``.name``.
	    if isinstance(file, (str, os.PathLike)):
	        file_path = os.fspath(file)
	    else:
	        file_path = file.name

	    # Compare extensions case-insensitively (".PDF" and ".pdf" are the same).
	    file_extension = os.path.splitext(file_path)[1].lower()

	    if file_extension == ".pdf":
	        return extract_text_from_pdf(file_path)

	    if file_extension in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"):
	        # The context manager closes the underlying file once OCR has finished.
	        with Image.open(file_path) as img:
	            return extract_text_from_image(img)

	    return "Unsupported file format. Please upload a PDF or image file (JPG, PNG, BMP, TIFF)."

	# Build the Gradio UI: a row with two columns (upload controls, OCR result),
	# followed by usage notes.
	with gr.Blocks(title="Handwritten Text OCR Extractor") as app:
	    gr.Markdown("# Handwritten Text OCR Extraction Tool")
	    gr.Markdown("Upload an image or PDF containing handwritten text to extract the content.")

	    with gr.Row():
	        # First column: file picker and the button that triggers extraction.
	        with gr.Column():
	            file_input = gr.File(label="Upload Image or PDF")
	            extract_button = gr.Button("Extract Text")

	        # Second column: read-out for the recognised text.
	        with gr.Column():
	            text_output = gr.Textbox(
	                label="Extracted Text",
	                lines=10,
	                placeholder="Extracted text will appear here...",
	            )

	    # Pressing the button sends the uploaded file through process_file and
	    # shows the returned string in the textbox.
	    extract_button.click(
	        fn=process_file,
	        inputs=[file_input],
	        outputs=[text_output],
	    )

	    gr.Markdown("### Notes:")
	    gr.Markdown("- For best results, ensure the handwriting is clear and the image is well-lit")
	    gr.Markdown("- The system works best with dark text on light background")
	    gr.Markdown("- Multiple page PDFs will show page breaks in the output")

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    app.launch()