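# NOTE: the three lines below are residue from the web page this file was
# copied from; they are kept as comments so the module parses as Python.
# podsnigame's picture
# Add application file
# b34881d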
import streamlit as st
import pdf2image
import pytesseract
from pytesseract import Output, TesseractError
from functions import convert_pdf_to_txt_pages, convert_pdf_to_txt_file, save_pages, displayPDF, images_to_txt
# Page-level configuration: only the page title is set here.
PAGE_TITLE = "PDF to Text"
st.set_page_config(page_title=PAGE_TITLE)
# HTML template for a thin horizontal divider. The single `{}` placeholder is
# filled with a CSS colour through str.format() before the snippet is rendered
# with st.markdown(..., unsafe_allow_html=True).
html_temp = (
    "\n"
    '<div style="background-color:{};padding:1px">\n'
    "</div>\n"
)
# Main page heading, written as a level-2 Markdown header.
page_heading = "\n## Text data extractor: PDF to Text\n"
st.markdown(page_heading)
# Display name -> language code. The keys populate the "document language"
# select box; the value for the chosen key is passed to images_to_txt().
# NOTE(review): the codes look like Tesseract language identifiers — confirm
# against the OCR helper before adding entries.
languages = dict(
    English='eng',
    French='fra',
    Arabic='ara',
    Spanish='spa',
)
# Sidebar: app title, output-format picker, OCR toggle, a short "how it works"
# blurb and the author credit, separated by thin divider lines.
# NOTE(review): indentation was lost in this copy of the file; the sidebar is
# assumed to end right before the file uploader — confirm against the app.
with st.sidebar:
    st.title(":outbox_tray: PDF to Text")

    # The selected label is compared later to decide between a single .txt
    # download and a ZIP with one text file per page.
    output_choices = ('One text file (.txt)', 'Text file per page (ZIP)')
    textOutput = st.selectbox("How do you want your output text?", output_choices)

    # When ticked, text is extracted through images_to_txt() instead of the
    # convert_pdf_to_txt_* helpers.
    ocr_box = st.checkbox('Enable OCR (scanned document)')

    # Both dividers use the same colour, so the snippet is formatted once.
    sidebar_divider = html_temp.format("rgba(55, 53, 47, 0.16)")

    st.markdown(sidebar_divider, unsafe_allow_html=True)
    st.markdown(
        "\n"
        "# How does it work?\n"
        "Simply load your PDF and convert it to single-page or multi-page text.\n"
    )
    st.markdown(sidebar_divider, unsafe_allow_html=True)
    st.markdown("\nMade by [@nainia_ayoub](https://twitter.com/nainia_ayoub)\n")
# Upload widget restricted to the "pdf" file type. The returned value is falsy
# until the user has provided a file.
pdf_file = st.file_uploader("Load your PDF", type="pdf")

# CSS injected into the page to hide the footer, the viewer badge and the main
# menu. The block must end with a closing </style> tag; the original ended with
# a second opening <style>, which left the element unterminated.
# NOTE(review): `.viewerBadge_container__1QSob` looks like a build-generated
# class name and may stop matching after a Streamlit upgrade — verify.
hide = """
<style>
footer{
visibility: hidden;
position: relative;
}
.viewerBadge_container__1QSob{
visibility: hidden;
}
#MainMenu{
visibility: hidden;
}
</style>
"""
st.markdown(hide, unsafe_allow_html=True)
# Main flow: once a PDF has been uploaded, preview it, extract its text (via
# OCR or direct extraction) and offer the result as a download.
if pdf_file:
    # getvalue() returns the complete upload without moving the stream
    # position. The original used read(), which left the stream at EOF before
    # the very same object was handed to the convert_pdf_to_txt_* helpers.
    pdf_bytes = pdf_file.getvalue()

    # display document
    with st.expander("Display document"):
        displayPDF(pdf_bytes)

    single_file = textOutput == 'One text file (.txt)'

    # pdf to text
    if ocr_box:
        # The language picker is only shown when OCR is enabled.
        ocr_language = st.selectbox('Select the document language', list(languages))
        # The OCR helper is called the same way for both output modes; its
        # result is a collection of per-page texts plus the page count.
        extracted, nb_pages = images_to_txt(pdf_bytes, languages[ocr_language])
        if single_file:
            # Merge the pages into one document, separated by a blank line.
            extracted = "\n\n".join(extracted)
    elif single_file:
        extracted, nb_pages = convert_pdf_to_txt_file(pdf_file)
    else:
        extracted, nb_pages = convert_pdf_to_txt_pages(pdf_file)

    st.info(f"Pages: {nb_pages} in total")

    if single_file:
        st.download_button("Download txt file", extracted)
    else:
        # save_pages() returns the path of the archive it wrote.
        zip_path = save_pages(extracted)
        # download text data
        with open(zip_path, "rb") as zip_fp:
            st.download_button(
                label="Download ZIP (txt)",
                data=zip_fp,
                file_name="pdf_to_txt.zip",
                mime="application/zip",
            )
# "Spread the word!" button: a link that opens a pre-filled tweet in a new tab,
# plus the CSS that styles the button (including its hover state).
# NOTE(review): indentation was lost in this copy of the file; this statement
# is treated as top-level (shown regardless of upload state) — confirm.
share_button_html = """
<a target="_blank" style="color: black" href="https://twitter.com/intent/tweet?text=You%20can%20extract%20text%20from%20your%20PDF%20using%20this%20PDF%20to%20Text%20streamlit%20app%20by%20@nainia_ayoub!%0A%0Ahttps://nainiayoub-pdf-text-data-extractor-app-p6hy0z.streamlit.app/">
<button class="btn">
Spread the word!
</button>
</a>
<style>
.btn{
display: inline-flex;
-moz-box-align: center;
align-items: center;
-moz-box-pack: center;
justify-content: center;
font-weight: 400;
padding: 0.25rem 0.75rem;
border-radius: 0.25rem;
margin: 0px;
line-height: 1.6;
color: rgb(49, 51, 63);
background-color: #fff;
width: auto;
user-select: none;
border: 1px solid rgba(49, 51, 63, 0.2);
}
.btn:hover{
color: #00acee;
background-color: #fff;
border: 1px solid #00acee;
}
</style>
"""
st.markdown(share_button_html, unsafe_allow_html=True)