"""Gradio demo that runs Tesseract OCR on an uploaded image."""

from typing import List

import pytesseract
from PIL import Image

import gradio as gr

# Flag pointing Tesseract at the traineddata files bundled next to this script.
# Shared by the OCR call and the language discovery so they cannot drift apart.
TESSDATA_CONFIG = '--tessdata-dir ./tessdata'


def tesseract_ocr(filepath: str, languages: List[str]) -> str:
    """Run Tesseract on the image at ``filepath`` and return the recognized text.

    Args:
        filepath: Path of the image file to read. Gradio passes ``None`` when
            no image has been provided.
        languages: Tesseract language codes; they are joined with ``+`` for the
            ``lang`` argument. When empty or ``None``, ``lang=None`` is passed
            so pytesseract falls back to its own default.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        gr.Error: If ``filepath`` is empty, so the UI shows a readable message
            instead of an opaque failure from ``Image.open(None)``.
    """
    if not filepath:
        raise gr.Error("Please provide an image to run OCR on.")

    oem_psm_config = f'--oem 3 --psm 11 {TESSDATA_CONFIG}'
    lang = '+'.join(languages) if languages else None

    # The context manager closes the underlying file handle once OCR is done;
    # a bare Image.open() would leak one handle per request.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang=lang, config=oem_psm_config)


title = "Shan Tesseract OCR"
description = "Gradio demo for Tesseract-OCR Shan. Tesseract is an open source text recognition (OCR) Engine."
# NOTE(review): this string (and the title markdown below) contains only blank
# lines; any HTML markup it once held appears to have been stripped. Confirm
# against the deployed version before relying on this footer.
article = "\n\n"

examples = [
    ["examples/example2.png", ["eng", "shn"]],
    ["examples/example3.png", ["eng", "shn"]],
    ["examples/example4.png", ["eng", "shn"]],
    ["examples/example1.png", ["eng", "shn"]],
]

with gr.Blocks(title=title) as demo:
    gr.Markdown(f'\n\n{title}\n\n')
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            image = gr.Image(type="filepath", label="Input")
            # Get the available languages from the tessdata prefix.
            language_choices = pytesseract.get_languages(config=TESSDATA_CONFIG)
            with gr.Accordion("Languages", open=False):
                languages = gr.CheckboxGroup(
                    language_choices,
                    type="value",
                    value=["eng", "shn"],
                    label='language',
                )
            with gr.Row():
                btn_clear = gr.ClearButton([image, languages])
                btn_submit = gr.Button(value="Submit", variant="primary")
        with gr.Column():
            text = gr.Textbox(label="Output")

    btn_submit.click(
        tesseract_ocr,
        inputs=[image, languages],
        outputs=text,
        api_name="tesseract-ocr",
    )
    # The output box is created after the clear button, so register it late.
    btn_clear.add(text)

    gr.Examples(
        examples=examples,
        inputs=[image, languages],
    )

    gr.Markdown(article)


if __name__ == '__main__':
    demo.launch()