Spaces:
Sleeping
Sleeping
vteam27
committed on
Commit
·
884f6b2
1
Parent(s):
45a9477
Added searchable PDF for English OCR
Browse files
app.py
CHANGED
|
@@ -7,6 +7,8 @@ from doctr.io import DocumentFile
|
|
| 7 |
from doctr.models import ocr_predictor
|
| 8 |
import gradio as gr
|
| 9 |
from PIL import Image
|
|
|
|
|
|
|
| 10 |
from happytransformer import HappyTextToText, TTSettings
|
| 11 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,logging
|
| 12 |
from transformers.integrations import deepspeed
|
|
@@ -74,13 +76,13 @@ def greet(img, apply_grammar_correction, apply_spell_check,lang_of_input):
|
|
| 74 |
res = pt.image_to_string(img,lang='hin')
|
| 75 |
_output_name = "RESULT_OCR.txt"
|
| 76 |
open(_output_name, 'w').write(res)
|
| 77 |
-
return res, _output_name
|
| 78 |
|
| 79 |
if (lang_of_input=="Punjabi"):
|
| 80 |
res = pt.image_to_string(img,lang='pan')
|
| 81 |
_output_name = "RESULT_OCR.txt"
|
| 82 |
open(_output_name, 'w').write(res)
|
| 83 |
-
return res, _output_name
|
| 84 |
|
| 85 |
|
| 86 |
img.save("out.jpg")
|
|
@@ -106,7 +108,19 @@ def greet(img, apply_grammar_correction, apply_spell_check,lang_of_input):
|
|
| 106 |
|
| 107 |
_output_name = "RESULT_OCR.txt"
|
| 108 |
open(_output_name, 'w').write(res)
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# Gradio Interface for OCR
|
| 112 |
demo_ocr = gr.Interface(
|
|
@@ -117,7 +131,7 @@ demo_ocr = gr.Interface(
|
|
| 117 |
gr.Checkbox(label="Apply Spell Check"),
|
| 118 |
gr.Dropdown(["English","Hindi","Punjabi"],label="Select Language")
|
| 119 |
],
|
| 120 |
-
outputs=["text", "file"],
|
| 121 |
title="DocTR OCR with Grammar and Spell Check",
|
| 122 |
description="Upload an image to get the OCR results. Optionally, apply grammar and spell check.",
|
| 123 |
examples=[["Examples/Book.png"], ["Examples/News.png"], ["Examples/Manuscript.jpg"], ["Examples/Files.jpg"],["Examples/Hindi.jpg"],["Examples/Hindi-manu.jpg"],["Examples/Punjabi_machine.png"]]
|
|
|
|
| 7 |
from doctr.models import ocr_predictor
|
| 8 |
import gradio as gr
|
| 9 |
from PIL import Image
|
| 10 |
+
import base64
|
| 11 |
+
from utils import HocrParser
|
| 12 |
from happytransformer import HappyTextToText, TTSettings
|
| 13 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,logging
|
| 14 |
from transformers.integrations import deepspeed
|
|
|
|
| 76 |
res = pt.image_to_string(img,lang='hin')
|
| 77 |
_output_name = "RESULT_OCR.txt"
|
| 78 |
open(_output_name, 'w').write(res)
|
| 79 |
+
return res, _output_name, None
|
| 80 |
|
| 81 |
if (lang_of_input=="Punjabi"):
|
| 82 |
res = pt.image_to_string(img,lang='pan')
|
| 83 |
_output_name = "RESULT_OCR.txt"
|
| 84 |
open(_output_name, 'w').write(res)
|
| 85 |
+
return res, _output_name, None
|
| 86 |
|
| 87 |
|
| 88 |
img.save("out.jpg")
|
|
|
|
| 108 |
|
| 109 |
_output_name = "RESULT_OCR.txt"
|
| 110 |
open(_output_name, 'w').write(res)
|
| 111 |
+
|
| 112 |
+
# Convert OCR output to searchable PDF
|
| 113 |
+
_output_name_pdf="RESULT_OCR.pdf"
|
| 114 |
+
xml_outputs = output.export_as_xml()
|
| 115 |
+
parser = HocrParser()
|
| 116 |
+
base64_encoded_pdfs = list()
|
| 117 |
+
for i, (xml, img) in enumerate(zip(xml_outputs, doc)):
|
| 118 |
+
xml_element_tree = xml[1]
|
| 119 |
+
parser.export_pdfa(_output_name_pdf,
|
| 120 |
+
hocr=xml_element_tree, image=img)
|
| 121 |
+
with open(_output_name_pdf, 'rb') as f:
|
| 122 |
+
base64_encoded_pdfs.append(base64.b64encode(f.read()))
|
| 123 |
+
return res, _output_name, _output_name_pdf
|
| 124 |
|
| 125 |
# Gradio Interface for OCR
|
| 126 |
demo_ocr = gr.Interface(
|
|
|
|
| 131 |
gr.Checkbox(label="Apply Spell Check"),
|
| 132 |
gr.Dropdown(["English","Hindi","Punjabi"],label="Select Language")
|
| 133 |
],
|
| 134 |
+
outputs=["text", "file", "file"],
|
| 135 |
title="DocTR OCR with Grammar and Spell Check",
|
| 136 |
description="Upload an image to get the OCR results. Optionally, apply grammar and spell check.",
|
| 137 |
examples=[["Examples/Book.png"], ["Examples/News.png"], ["Examples/Manuscript.jpg"], ["Examples/Files.jpg"],["Examples/Hindi.jpg"],["Examples/Hindi-manu.jpg"],["Examples/Punjabi_machine.png"]]
|