DeepDiveDev committed
Commit e638a74 · verified · 1 Parent(s): c3163b4

Update app.py

Files changed (1)
  1. app.py +122 -56
app.py CHANGED
@@ -1,63 +1,129 @@
-import torch
-import cv2
-import json
-import xml.etree.ElementTree as ET
 import gradio as gr
-from transformers import TrOCRProcessor, VisionEncoderDecoderModel, GPT2LMHeadModel, GPT2Tokenizer
-
-# Load OCR model (TrOCR)
-processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
-model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
-
-# Load GPT-2 model
-GPT2_model = GPT2LMHeadModel.from_pretrained("gpt2")
-GPT2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-
-# Image preprocessing
-def preprocess_image(image_path):
-    image = cv2.imread(image_path)
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    return gray
-
-# Extract text using TrOCR (instead of Tesseract)
-def extract_text(image_path):
-    image = preprocess_image(image_path)
-    pixel_values = processor(image, return_tensors="pt").pixel_values
-    generated_ids = model.generate(pixel_values)
-    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return text
-
-# Generate structured format (JSON/XML)
-def generate_machine_readable_format(text, format_type='json'):
-    if format_type == 'json':
-        return json.dumps({"content": text})
-    elif format_type == 'xml':
-        root = ET.Element("Document")
-        content = ET.SubElement(root, "Content")
-        content.text = text
-        return ET.tostring(root, encoding='unicode')
-    return text
-
-# GPT-2 for structured output
-def generate_structured_output(text):
-    inputs = GPT2_tokenizer.encode(text, return_tensors="pt")
-    outputs = GPT2_model.generate(inputs, max_length=500)
-    return GPT2_tokenizer.decode(outputs[0])
-
-# Convert document
-def convert_document(image, output_format='json'):
-    text = extract_text(image)
-    structured_output = generate_structured_output(text)
-    machine_readable_output = generate_machine_readable_format(structured_output, format_type=output_format)
-    return machine_readable_output
-
-# Gradio UI
-iface = gr.Interface(
-    fn=convert_document,
-    inputs=[gr.Image(type="filepath"), gr.Radio(["json", "xml"], label="Output Format")],
-    outputs="text",
-    title="Document OCR and Conversion",
-    description="Extracts text from images and converts it into structured JSON/XML format."
-)
-
-iface.launch()
+import cv2
+import numpy as np
+import pytesseract
+from PIL import Image
+import io
+import matplotlib.pyplot as plt
+
+# Configure pytesseract path (adjust this based on your installation)
+# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Uncomment and modify for Windows
+
+def preprocess_image(image):
+    """Preprocess the image to improve OCR accuracy for handwritten text"""
+    # Convert to grayscale if it's a color image
+    if len(image.shape) == 3:
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    else:
+        gray = image.copy()
+
+    # Apply adaptive thresholding
+    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                   cv2.THRESH_BINARY_INV, 11, 2)
+
+    # Noise removal using morphological operations
+    kernel = np.ones((1, 1), np.uint8)
+    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
+
+    # Dilate to connect components
+    kernel = np.ones((2, 2), np.uint8)
+    dilated = cv2.dilate(opening, kernel, iterations=1)
+
+    return dilated
+
+def perform_ocr(input_image):
+    """Process the image and perform OCR"""
+    if input_image is None:
+        return "No image provided", None
+
+    # Convert from RGB to BGR (OpenCV format)
+    image_np = np.array(input_image)
+    if len(image_np.shape) == 3:
+        image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+
+    # Preprocess the image
+    preprocessed = preprocess_image(image_np)
+
+    # Convert back to PIL for visualization
+    pil_preprocessed = Image.fromarray(preprocessed)
+
+    # Use pytesseract with specific configurations for handwritten text
+    custom_config = r'--oem 3 --psm 6 -l eng -c preserve_interword_spaces=1 -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,;:\'\"()[]{}!?-+*/=><_%$#@&|~^`\\ "'
+
+    # Perform OCR
+    extracted_text = pytesseract.image_to_string(pil_preprocessed, config=custom_config)
+
+    # Return the extracted text and the preprocessed image for visualization
+    return extracted_text, pil_preprocessed
+
+def ocr_pipeline(input_image):
+    """Complete OCR pipeline with visualization"""
+
+    extracted_text, preprocessed_image = perform_ocr(input_image)
+
+    # Create visualization
+    if input_image is not None and preprocessed_image is not None:
+        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
+        ax1.imshow(input_image)
+        ax1.set_title("Original Image")
+        ax1.axis("off")
+
+        ax2.imshow(preprocessed_image, cmap='gray')
+        ax2.set_title("Preprocessed Image")
+        ax2.axis("off")
+
+        plt.tight_layout()
+
+        # Convert plot to image
+        buf = io.BytesIO()
+        plt.savefig(buf, format='png')
+        buf.seek(0)
+        viz_img = Image.open(buf)
+        plt.close(fig)
+
+        return extracted_text, viz_img
+
+    return extracted_text, None
+
+# Create the Gradio interface
+with gr.Blocks(title="Handwritten OCR App") as app:
+    gr.Markdown("# Handwritten Text OCR Extraction")
+    gr.Markdown("""
+    This app extracts text from handwritten notes.
+    Upload an image containing handwritten text and the app will convert it to digital text.
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(type="pil", label="Upload Handwritten Image")
+            run_button = gr.Button("Extract Text")
+
+        with gr.Column():
+            output_text = gr.Textbox(label="Extracted Text", lines=15)
+            processed_image = gr.Image(label="Preprocessing Visualization")
+
+    run_button.click(
+        fn=ocr_pipeline,
+        inputs=input_image,
+        outputs=[output_text, processed_image]
+    )
+
+    gr.Markdown("""
+    ## Tips for better results:
+    - Ensure good lighting and contrast in the image
+    - Try to keep the text as horizontal as possible
+    - Clear handwriting works best
+    - For better results, you may need to crop the image to focus on specific sections
+    """)
+
+    # Add example images
+    gr.Examples(
+        examples=[
+            "handwritten_sample.jpg",  # Replace with your example image paths
+        ],
+        inputs=input_image,
+    )
+
+# Launch the app
+if __name__ == "__main__":
+    app.launch()
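
For a quick local check of the new pipeline outside the Gradio UI, a minimal sketch like the following can exercise perform_ocr directly. Assumptions not part of this commit: the file is saved as app.py, the Tesseract binary is installed and on PATH, and "sample.jpg" is a hypothetical placeholder for a handwritten image.

import pytesseract
from PIL import Image

from app import perform_ocr  # assumes the committed file is saved as app.py

# Confirm the Tesseract binary is reachable before running OCR;
# this raises TesseractNotFoundError if it is not installed.
print("Tesseract version:", pytesseract.get_tesseract_version())

# "sample.jpg" is a hypothetical placeholder path, not part of the commit.
image = Image.open("sample.jpg").convert("RGB")
text, preprocessed = perform_ocr(image)

print(text)
if preprocessed is not None:
    preprocessed.save("preprocessed.png")  # inspect what Tesseract actually saw

Note that --psm 6 tells Tesseract to assume a single uniform block of text, so multi-column or heavily skewed pages may segment poorly.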