Spaces:
Sleeping
Sleeping
Thomas Dehaene
committed on
Commit
·
d5101c4
1
Parent(s):
e168969
Add app
Browse files
README.md
CHANGED
|
@@ -1,37 +1,28 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: streamlit
|
| 7 |
app_file: app.py
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
-
|
| 11 |
# Configuration
|
| 12 |
-
|
| 13 |
`title`: _string_
|
| 14 |
Display title for the Space
|
| 15 |
-
|
| 16 |
`emoji`: _string_
|
| 17 |
Space emoji (emoji-only character allowed)
|
| 18 |
-
|
| 19 |
`colorFrom`: _string_
|
| 20 |
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
| 21 |
-
|
| 22 |
`colorTo`: _string_
|
| 23 |
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
| 24 |
-
|
| 25 |
`sdk`: _string_
|
| 26 |
Can be either `gradio` or `streamlit`
|
| 27 |
-
|
| 28 |
`sdk_version` : _string_
|
| 29 |
Only applicable for `streamlit` SDK.
|
| 30 |
See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
|
| 31 |
-
|
| 32 |
`app_file`: _string_
|
| 33 |
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
|
| 34 |
Path is relative to the root of the repository.
|
| 35 |
-
|
| 36 |
`pinned`: _boolean_
|
| 37 |
-
Whether the Space stays on top of your list.
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ByT5 dOCRtor
|
| 3 |
+
emoji: 💊
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: streamlit
|
| 7 |
app_file: app.py
|
| 8 |
pinned: false
|
| 9 |
---
|
|
|
|
| 10 |
# Configuration
|
|
|
|
| 11 |
`title`: _string_
|
| 12 |
Display title for the Space
|
|
|
|
| 13 |
`emoji`: _string_
|
| 14 |
Space emoji (emoji-only character allowed)
|
|
|
|
| 15 |
`colorFrom`: _string_
|
| 16 |
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
|
|
|
| 17 |
`colorTo`: _string_
|
| 18 |
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
|
|
|
| 19 |
`sdk`: _string_
|
| 20 |
Can be either `gradio` or `streamlit`
|
|
|
|
| 21 |
`sdk_version` : _string_
|
| 22 |
Only applicable for `streamlit` SDK.
|
| 23 |
See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
|
|
|
|
| 24 |
`app_file`: _string_
|
| 25 |
Path to your main application file (which contains either `gradio` or `streamlit` Python code).
|
| 26 |
Path is relative to the root of the repository.
|
|
|
|
| 27 |
`pinned`: _boolean_
|
| 28 |
+
Whether the Space stays on top of your list.
|
app.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from textwrap import wrap
|
| 2 |
+
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
import nlpaug.augmenter.char as nac
|
| 6 |
+
|
| 7 |
+
# Static copy for the top of the page, kept separate from the rendering calls.
_PAGE_TITLE = '# ByT5 Dutch OCR Corrector :pill:'
_PAGE_SUMMARY = 'This app corrects common dutch OCR mistakes, to showcase how this could be used in an OCR post-processing pipeline.'
# Markdown bullet list explaining the two ways to drive the demo.
_USAGE_NOTES = """
To use this:
- Enter a text with OCR mistakes and hit 'unscramble':point_down:
- Or enter a normal text, scramble it :twisted_rightwards_arrows: and then hit 'unscramble' :point_down:"""

# Render order matters: title, then the one-line summary, then the instructions.
st.markdown(_PAGE_TITLE)
st.write(_PAGE_SUMMARY)
st.markdown(_USAGE_NOTES)
|
| 14 |
+
|
| 15 |
+
@st.cache(
    allow_output_mutation=True,
    suppress_st_warning=True,
    show_spinner=False,
)
def load_model():
    """Build the text2text-generation pipeline used for OCR correction.

    The model and the tokenizer are both loaded from the same checkpoint,
    ``ml6team/byt5-base-dutch-ocr-correction``. While loading, a custom
    spinner message is shown; the default ``st.cache`` spinner is turned off
    via ``show_spinner=False`` so only that message appears.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    checkpoint = 'ml6team/byt5-base-dutch-ocr-correction'
    with st.spinner('Please wait for the model to load...'):
        return pipeline(
            'text2text-generation',
            model=checkpoint,
            tokenizer=checkpoint,
        )
|
| 26 |
+
|
| 27 |
+
def _augmented_to_text(augmented):
    """Collapse the result of ``OcrAug.augment`` into one string.

    Depending on the installed nlpaug release, ``augment`` returns either a
    plain string or a list of strings (possibly empty). The text area needs a
    string, so list results are joined with single spaces.
    """
    if isinstance(augmented, str):
        return augmented
    return " ".join(augmented)


def _correct_text(text, corrector, width=128):
    """Run ``corrector`` over ``text`` in chunks of at most ``width`` characters.

    Args:
        text: The raw text to correct.
        corrector: Callable taking a list of strings and returning a list of
            dicts that each carry a ``'generated_text'`` entry.
        width: Maximum chunk length handed to ``textwrap.wrap``.

    Returns:
        The corrected chunks joined by single spaces, or ``""`` when ``text``
        yields no chunks (empty or whitespace-only input), in which case the
        corrector is not called at all.
    """
    chunks = wrap(text, width)
    if not chunks:
        # wrap() returns [] for blank input; avoid calling the model on an empty batch.
        return ""
    return " ".join(result['generated_text'] for result in corrector(chunks))


ocr_pipeline = load_model()

# 'text' holds the value used to seed the input box across reruns.
if 'text' not in st.session_state:
    st.session_state.text = ""

left_area, right_area = st.columns(2)

# Left column: input form with a replaceable text area and two submit buttons.
left_area.header("Input")
form = left_area.form(key='ocrcorrector')
placeholder = form.empty()
placeholder.empty()
input_text = placeholder.text_area(value=st.session_state.text, label='Insert text:', key='input_text')
scramble_button = form.form_submit_button(label='Scramble')
submit_button = form.form_submit_button(label='Unscramble')

# Right column: corrected output.
right_area.header("Output")

if scramble_button:
    source_text = st.session_state.input_text
    if source_text.strip():
        # Normalise the augmenter result so session state always holds a string.
        scrambled = _augmented_to_text(nac.OcrAug().augment(source_text))
    else:
        # Nothing to scramble; keep the (blank) input unchanged.
        scrambled = source_text
    st.session_state.text = scrambled
    # Drop the stored widget value and rebuild the text area in the same slot,
    # presumably so the new widget is seeded from `value=` rather than from the
    # previous state kept under the 'input_text' key.
    del st.session_state.input_text
    placeholder.empty()
    input_text = placeholder.text_area(value=st.session_state.text, label='Insert text:', key='input_text')

if submit_button:
    output_text = _correct_text(st.session_state.input_text, ocr_pipeline)
    # Empty heading; looks like a vertical spacer above the output box.
    right_area.markdown('#####')
    right_area.text_area(value=output_text, label="Corrected text:")
|
| 61 |
+
|