# DeepDiveDev's picture
# Update app.py
# 13a649f verified
# raw
# history blame
# 3.74 kB
import gradio as gr
import pytesseract
from PIL import Image
import pdf2image
import tempfile
import os
import cv2
import numpy as np
# You may need to set the path to tesseract executable if it's not in PATH
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # For Windows
# For Linux/Mac, ensure Tesseract is installed
def preprocess_image(img):
    """Binarize an image with Otsu thresholding to prepare it for OCR.

    Args:
        img: A ``PIL.Image.Image`` (any mode), or anything ``np.array``
            accepts: a 2-D array is used as grayscale as-is, any other
            array is converted with ``cv2.COLOR_RGB2GRAY``.

    Returns:
        A ``PIL.Image.Image`` built from the thresholded single-channel
        array, with the polarity of the input preserved (pixels above the
        Otsu threshold end up at 255, the rest at 0).
    """
    if isinstance(img, Image.Image):
        # Let Pillow perform the grayscale conversion so every image mode is
        # handled: np.array() on a palette ("P") image yields palette indices,
        # on a bilevel ("1") image a boolean array, and on an "LA" image a
        # 2-channel array -- none of which the OpenCV calls below treat
        # correctly.
        gray = np.array(img.convert("L"))
    else:
        img_array = np.array(img)
        if img_array.ndim == 2:
            # Already single-channel.
            gray = img_array
        else:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

    # The threshold value 0 is a placeholder: THRESH_OTSU selects it
    # automatically. The result is inverted here and flipped back below.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Morphological close followed by open, intended as noise removal.
    # NOTE(review): a 1x1 structuring element leaves the image unchanged, so
    # these two calls are effectively no-ops; a larger kernel would be needed
    # for real denoising -- confirm the intended size before changing it.
    kernel = np.ones((1, 1), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Undo the inversion introduced by THRESH_BINARY_INV.
    binary = 255 - binary
    return Image.fromarray(binary)
def extract_text_from_image(img):
    """Run OCR on *img* and return the recognized text without surrounding whitespace.

    The image is first passed through ``preprocess_image`` and the result is
    handed to ``pytesseract.image_to_string``.
    """
    # Options forwarded to Tesseract: engine mode 3, page-segmentation mode 6,
    # English language data, and a whitelist restricting the output characters.
    tesseract_options = r'--oem 3 --psm 6 -l eng -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?@#$%^&*()-+=_:;\'\" "'
    cleaned = preprocess_image(img)
    recognized = pytesseract.image_to_string(cleaned, config=tesseract_options)
    return recognized.strip()
def extract_text_from_pdf(pdf_path):
    """OCR every page of the PDF at *pdf_path* and return the combined text.

    Each page is rendered to an image with ``pdf2image`` and passed to
    ``extract_text_from_image``; the per-page results are joined with a
    visible "Page Break" separator.
    """
    page_separator = "\n\n--- Page Break ---\n\n"
    with tempfile.TemporaryDirectory() as scratch_dir:
        pages = pdf2image.convert_from_path(pdf_path, output_folder=scratch_dir)
        # OCR runs while the scratch directory still exists; presumably the
        # rendered page images are backed by files inside it.
        page_texts = [extract_text_from_image(page) for page in pages]
    return page_separator.join(page_texts)
def process_file(file):
    """Extract text from an uploaded PDF or image file.

    Args:
        file: The value delivered by the upload component. May be ``None``
            (nothing uploaded), a filesystem path (``str`` or
            ``os.PathLike``), or an object exposing the path as ``.name``
            (for example a temporary-file wrapper).

    Returns:
        The extracted text, or a user-facing message when no file was
        uploaded or the file extension is not supported.
    """
    if file is None:
        return "No file uploaded. Please upload an image or PDF file."

    # The upload may arrive either as a plain path or as a file-like object
    # carrying the path in ``.name``; accept both so the handler does not fail
    # with AttributeError when given a string.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == ".pdf":
        return extract_text_from_pdf(file_path)

    if file_extension in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"):
        # Context manager guarantees the image file handle is released even
        # if OCR raises.
        with Image.open(file_path) as img:
            return extract_text_from_image(img)

    return "Unsupported file format. Please upload a PDF or image file (JPG, PNG, BMP, TIFF)."
# Build the Gradio UI: an upload column with the trigger button on the left,
# the OCR result on the right, followed by a short list of usage notes.
with gr.Blocks(title="Handwritten Text OCR Extractor") as app:
    gr.Markdown("# Handwritten Text OCR Extraction Tool")
    gr.Markdown("Upload an image or PDF containing handwritten text to extract the content.")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload Image or PDF")
            extract_button = gr.Button("Extract Text")
        with gr.Column():
            text_output = gr.Textbox(
                label="Extracted Text",
                lines=10,
                placeholder="Extracted text will appear here...",
            )

    # Clicking the button feeds the uploaded file to process_file and shows
    # its return value in the output textbox.
    extract_button.click(fn=process_file, inputs=[file_input], outputs=[text_output])

    # One Markdown component per line, in display order.
    for _note in (
        "### Notes:",
        "- For best results, ensure the handwriting is clear and the image is well-lit",
        "- The system works best with dark text on light background",
        "- Multiple page PDFs will show page breaks in the output",
    ):
        gr.Markdown(_note)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()