DeepDiveDev committed on
Commit
b896977
·
verified ·
1 Parent(s): 9b7f683

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -28
app.py CHANGED
@@ -1,34 +1,96 @@
1
  import gradio as gr
2
- from PyPDF2 import PdfReader
3
- import io
 
 
 
 
 
4
 
5
- # Function to convert PDF to text (handles both byte data and file uploads)
6
- def pdf_to_text(file_input):
7
- # If the input is in byte format (i.e., it comes as raw bytes from a file or Base64 encoding)
8
- if isinstance(file_input, bytes):
9
- # Treat it as byte data and convert it to a file-like object
10
- pdf_file = io.BytesIO(file_input)
11
- else:
12
- # If it's a regular PDF file (file upload), open it from the file input
13
- pdf_file = file_input.name # This will get the file path if it's a regular file upload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- try:
16
- reader = PdfReader(pdf_file)
17
- text = ""
18
- for page in reader.pages:
19
- text += page.extract_text()
20
- return text
21
- except Exception as e:
22
- return f"Error while processing the PDF: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # Gradio interface: allow both file uploads and byte data input
25
- iface = gr.Interface(
26
- fn=pdf_to_text, # Function to call for text extraction
27
- inputs=gr.File(label="Upload PDF or send Byte data"), # File input
28
- outputs="text", # Output the extracted text
29
- title="PDF to Text Conversion",
30
- description="Upload a PDF file or send byte data (Base64 encoded) to extract its text."
31
- )
 
 
 
 
 
 
 
 
 
 
 
32
 
 
33
  if __name__ == "__main__":
34
- iface.launch()
 
1
  import gradio as gr
2
+ import pytesseract
3
+ from PIL import Image
4
+ import pdf2image
5
+ import tempfile
6
+ import os
7
+ import cv2
8
+ import numpy as np
9
 
10
+ # You may need to set the path to tesseract executable if it's not in PATH
11
+ # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # For Windows
12
+ # For Linux/Mac, ensure Tesseract is installed
13
+
14
def preprocess_image(img, kernel_size=1):
    """Binarize a PIL image with Otsu thresholding before it is sent to OCR.

    Args:
        img: PIL image in any mode (RGB, RGBA, palette, grayscale, 1-bit).
        kernel_size: Side length, in pixels, of the square structuring
            element used for the morphological close/open clean-up. The
            default of 1 makes that step an identity operation (this matches
            the previous hard-coded 1x1 kernel); pass 2 or 3 to actually
            remove small specks.

    Returns:
        A single-channel PIL image containing only 0 and 255, with the same
        polarity as the input (pixels brighter than the Otsu threshold are
        white).
    """
    # PIL hands over channels in RGB order, not OpenCV's BGR, and grayscale,
    # palette or 1-bit images yield a 2-D array that cvtColor rejects.
    # Normalising to RGB first makes every mode take the same path.
    rgb = np.array(img.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    # Inverted threshold: dark ink becomes the white foreground, so the
    # morphology below operates on the strokes rather than on the paper.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # A 1x1 kernel changes nothing, so skip the work unless a real size is given.
    if kernel_size > 1:
        kernel = np.ones((kernel_size, kernel_size), np.uint8)
        binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
        binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Invert back so the result has the original polarity.
    return Image.fromarray(255 - binary)
31
+
32
def extract_text_from_image(img):
    """Run Tesseract OCR on a single image and return the recognised text.

    The image is first passed through ``preprocess_image``. Tesseract is then
    invoked with OCR engine mode 3, page segmentation mode 6, the English
    language pack and a character whitelist.

    Args:
        img: PIL image to read.

    Returns:
        The recognised text with leading and trailing whitespace removed.
    """
    tesseract_options = r'--oem 3 --psm 6 -l eng -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?@#$%^&*()-+=_:;\'\" "'
    cleaned = preprocess_image(img)
    raw_text = pytesseract.image_to_string(cleaned, config=tesseract_options)
    return raw_text.strip()
42
 
43
def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and return the text of all pages.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The per-page OCR results joined with a "--- Page Break ---"
        separator, in page order.
    """
    page_texts = []
    # The rendered pages are written into a scratch directory, so all OCR has
    # to finish, and every page image has to be closed, before the directory
    # is removed on exit from the ``with`` block.
    with tempfile.TemporaryDirectory() as work_dir:
        pages = pdf2image.convert_from_path(pdf_path, output_folder=work_dir)
        for page in pages:
            try:
                page_texts.append(extract_text_from_image(page))
            finally:
                # Release the file handle even if OCR fails on this page.
                page.close()

    return "\n\n--- Page Break ---\n\n".join(page_texts)
56
+
57
def process_file(file):
    """Extract text from an uploaded PDF or image.

    Args:
        file: The upload to process. May be ``None`` (nothing uploaded), a
            filesystem path (``str`` or ``os.PathLike``), or any object whose
            ``name`` attribute holds the path.

    Returns:
        The extracted text, or a human-readable message when nothing was
        uploaded or the file extension is not supported.
    """
    if file is None:
        return "No file uploaded. Please upload an image or PDF file."

    # Accept plain paths as well as upload wrappers that expose ``.name``.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    file_extension = os.path.splitext(path)[1].lower()

    if file_extension == ".pdf":
        return extract_text_from_pdf(path)

    if file_extension in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"):
        # The context manager closes the image file once OCR is done.
        with Image.open(path) as img:
            return extract_text_from_image(img)

    return "Unsupported file format. Please upload a PDF or image file (JPG, PNG, BMP, TIFF)."
73
 
74
+ # Create Gradio interface
75
with gr.Blocks(title="Handwritten Text OCR Extractor") as app:
    gr.Markdown("# Handwritten Text OCR Extraction Tool")
    gr.Markdown("Upload an image or PDF containing handwritten text to extract the content.")

    with gr.Row():
        with gr.Column():
            # file_types entries are either a category name ("image") or a
            # dotted extension; a bare "pdf" is neither, so use ".pdf".
            file_input = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"])
            extract_button = gr.Button("Extract Text")

        with gr.Column():
            text_output = gr.Textbox(label="Extracted Text", lines=10, placeholder="Extracted text will appear here...")

    # Run OCR on the uploaded file when the button is pressed.
    extract_button.click(fn=process_file, inputs=[file_input], outputs=[text_output])

    gr.Markdown("### Notes:")
    gr.Markdown("- For best results, ensure the handwriting is clear and the image is well-lit")
    gr.Markdown("- The system works best with dark text on light background")
    gr.Markdown("- Multiple page PDFs will show page breaks in the output")

# Launch the app
if __name__ == "__main__":
    app.launch()