"""Gradio app that extracts text from uploaded images and PDFs using Tesseract OCR."""

import os
import tempfile

import cv2
import gradio as gr
import numpy as np
import pdf2image
import pytesseract
from PIL import Image

# You may need to set the path to the tesseract executable if it's not in PATH:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # For Windows
# For Linux/Mac, ensure Tesseract is installed

# File extensions that process_file treats as images.
SUPPORTED_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif")

# Separator inserted between the texts of consecutive PDF pages.
PAGE_SEPARATOR = "\n\n--- Page Break ---\n\n"

# Tesseract options: engine mode 3, page segmentation mode 6, English language,
# and a whitelist restricting the characters that may be recognised.
TESSERACT_CONFIG = r'--oem 3 --psm 6 -l eng -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?@#$%^&*()-+=_:;\'\" "'


def preprocess_image(img):
    """Binarise an image to improve OCR accuracy for handwritten text.

    Args:
        img: A PIL image in any mode (RGB, RGBA, L, P, ...).

    Returns:
        A new single-channel PIL image containing only 0 and 255 values.
    """
    # Let PIL do the grayscale conversion: PIL arrays are RGB (not BGR), and
    # cv2.cvtColor(..., COLOR_BGR2GRAY) would also reject single-channel input.
    gray = np.array(img.convert("L"))

    # Otsu picks the threshold automatically; the inverted form makes ink white
    # so that the morphology below operates on the strokes.
    _, binary = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Noise removal.
    # NOTE(review): a 1x1 structuring element leaves the image unchanged, so
    # these two passes currently remove nothing. Kept as-is to preserve OCR
    # output; consider a larger kernel after testing on real samples.
    kernel = np.ones((1, 1), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Invert back so the result is dark text on a light background.
    binary = 255 - binary

    return Image.fromarray(binary)


def extract_text_from_image(img):
    """Run OCR on a single image and return the recognised text.

    Args:
        img: A PIL image.

    Returns:
        The text reported by Tesseract with surrounding whitespace stripped.
    """
    processed_img = preprocess_image(img)
    text = pytesseract.image_to_string(processed_img, config=TESSERACT_CONFIG)
    return text.strip()


def extract_text_from_pdf(pdf_path):
    """Extract text from every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by a "--- Page Break ---" separator.
    """
    page_texts = []
    with tempfile.TemporaryDirectory() as path:
        pages = pdf2image.convert_from_path(pdf_path, output_folder=path)
        # OCR must happen while the temporary directory still exists: with
        # output_folder set, the page images are backed by files inside it.
        for page in pages:
            # Closing each page releases its file handle so the directory can
            # be removed cleanly on exit.
            with page:
                page_texts.append(extract_text_from_image(page))

    return PAGE_SEPARATOR.join(page_texts)


def process_file(file):
    """Process the uploaded file (PDF or image) and return its text.

    Args:
        file: The value delivered by the Gradio File component: ``None`` when
            nothing was uploaded, a filesystem path, or an object exposing the
            path through a ``name`` attribute.

    Returns:
        The extracted text, or a user-facing message when no file was given or
        the extension is not supported.
    """
    if file is None:
        return "No file uploaded. Please upload an image or PDF file."

    # Depending on the Gradio version/configuration the callback receives
    # either a path string or a temp-file wrapper with a `.name` attribute.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = file.name

    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == ".pdf":
        return extract_text_from_pdf(file_path)

    if file_extension in SUPPORTED_IMAGE_EXTENSIONS:
        # The context manager closes the underlying file once OCR is done.
        with Image.open(file_path) as img:
            return extract_text_from_image(img)

    return "Unsupported file format. Please upload a PDF or image file (JPG, PNG, BMP, TIFF)."


# Create Gradio interface
with gr.Blocks(title="Handwritten Text OCR Extractor") as app:
    gr.Markdown("# Handwritten Text OCR Extraction Tool")
    gr.Markdown("Upload an image or PDF containing handwritten text to extract the content.")

    with gr.Row():
        with gr.Column():
            # Extensions must carry a leading dot; only keywords such as
            # "image" are accepted without one.
            file_input = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"])
            extract_button = gr.Button("Extract Text")

        with gr.Column():
            text_output = gr.Textbox(
                label="Extracted Text",
                lines=10,
                placeholder="Extracted text will appear here...",
            )

    extract_button.click(fn=process_file, inputs=[file_input], outputs=[text_output])

    gr.Markdown("### Notes:")
    gr.Markdown("- For best results, ensure the handwriting is clear and the image is well-lit")
    gr.Markdown("- The system works best with dark text on light background")
    gr.Markdown("- Multiple page PDFs will show page breaks in the output")

# Launch the app
if __name__ == "__main__":
    app.launch()