Spaces:
Runtime error
Runtime error
File size: 4,070 Bytes
b34881d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import streamlit as st
import pdf2image
import pytesseract
from pytesseract import Output, TesseractError
from functions import convert_pdf_to_txt_pages, convert_pdf_to_txt_file, save_pages, displayPDF, images_to_txt
# Browser-tab title for the app.
st.set_page_config(page_title="PDF to Text")

# HTML for a 1px-padded coloured <div>, used as a divider line. The "{}"
# placeholder is filled with a CSS colour through str.format().
html_temp = (
    '\n'
    '<div style="background-color:{};padding:1px">\n'
    '</div>\n'
)

# Main page heading.
st.markdown("\n## Text data extractor: PDF to Text\n")
# Display name -> language code. The keys populate the language selectbox (in
# this order) and the selected value is passed to images_to_txt() when OCR is
# enabled. Presumably these are Tesseract language codes -- confirm.
languages = {
    "English": "eng",
    "French": "fra",
    "Arabic": "ara",
    "Spanish": "spa",
}
# Sidebar: output-format selector, OCR toggle and a short "about" section.
# NOTE(review): indentation was lost in this copy of the file; the sidebar is
# assumed to end just before the file uploader -- confirm against the
# deployed layout.
with st.sidebar:
    st.title(":outbox_tray: PDF to Text")

    # Read further down to choose between one .txt and a ZIP of per-page files.
    textOutput = st.selectbox(
        "How do you want your output text?",
        ('One text file (.txt)', 'Text file per page (ZIP)'),
    )
    # When ticked, text is extracted with images_to_txt() instead of the
    # convert_pdf_to_txt_* helpers.
    ocr_box = st.checkbox('Enable OCR (scanned document)')

    # The same divider markup is rendered twice, so build it once.
    sidebar_rule = html_temp.format("rgba(55, 53, 47, 0.16)")

    st.markdown(sidebar_rule, unsafe_allow_html=True)
    st.markdown(
        "\n# How does it work?\n"
        "Simply load your PDF and convert it to single-page or multi-page text.\n"
    )
    st.markdown(sidebar_rule, unsafe_allow_html=True)
    st.markdown("\nMade by [@nainia_ayoub](https://twitter.com/nainia_ayoub)\n")

# Upload widget restricted to PDF files; falsy until the user provides a file.
pdf_file = st.file_uploader("Load your PDF", type="pdf")
# CSS that hides the page footer, the ".viewerBadge_container__1QSob" badge
# and the "#MainMenu" element. It is injected verbatim with
# unsafe_allow_html=True, so the <style> element must be explicitly closed:
# the original ended with a second opening tag, leaving it unterminated.
hide = """
<style>
footer{
visibility: hidden;
position: relative;
}
.viewerBadge_container__1QSob{
visibility: hidden;
}
#MainMenu{
visibility: hidden;
}
</style>
"""
# Inject the CSS overrides defined in ``hide``.
st.markdown(hide, unsafe_allow_html=True)

# Everything below runs only once a PDF has been uploaded.
if pdf_file:
    # getvalue() returns the upload's bytes without moving the stream
    # position, so ``pdf_file`` itself can still be handed to the
    # convert_pdf_to_txt_* helpers below. read() would leave it at EOF.
    path = pdf_file.getvalue()

    # display document
    with st.expander("Display document"):
        displayPDF(path)

    # The language picker is only shown (and ``option`` only defined) in OCR
    # mode; every later use of ``option`` is guarded by the same flag.
    if ocr_box:
        option = st.selectbox('Select the document language', list(languages))

    # pdf to text
    if textOutput == 'One text file (.txt)':
        if ocr_box:
            # OCR yields a sequence of text chunks (presumably one per page);
            # join them into one document separated by blank lines.
            texts, nbPages = images_to_txt(path, languages[option])
            text_data_f = "\n\n".join(texts)
        else:
            text_data_f, nbPages = convert_pdf_to_txt_file(pdf_file)

        totalPages = f"Pages: {nbPages} in total"
        st.info(totalPages)
        st.download_button("Download txt file", text_data_f)
    else:
        if ocr_box:
            text_data, nbPages = images_to_txt(path, languages[option])
        else:
            text_data, nbPages = convert_pdf_to_txt_pages(pdf_file)

        totalPages = f"Pages: {nbPages} in total"
        st.info(totalPages)

        # save_pages() returns the path of the archive served below.
        zipPath = save_pages(text_data)
        # download text data
        with open(zipPath, "rb") as fp:
            st.download_button(
                label="Download ZIP (txt)",
                data=fp,
                file_name="pdf_to_txt.zip",
                mime="application/zip",
            )
# "Spread the word!" link: opens a pre-filled tweet intent in a new tab and is
# styled by the inline ".btn" rules that follow the anchor.
# NOTE(review): indentation was lost in this copy of the file; the call is kept
# at module level as it appears here -- confirm it was not meant to live inside
# the `if pdf_file:` branch.
share_button_html = '''
<a target="_blank" style="color: black" href="https://twitter.com/intent/tweet?text=You%20can%20extract%20text%20from%20your%20PDF%20using%20this%20PDF%20to%20Text%20streamlit%20app%20by%20@nainia_ayoub!%0A%0Ahttps://nainiayoub-pdf-text-data-extractor-app-p6hy0z.streamlit.app/">
<button class="btn">
Spread the word!
</button>
</a>
<style>
.btn{
display: inline-flex;
-moz-box-align: center;
align-items: center;
-moz-box-pack: center;
justify-content: center;
font-weight: 400;
padding: 0.25rem 0.75rem;
border-radius: 0.25rem;
margin: 0px;
line-height: 1.6;
color: rgb(49, 51, 63);
background-color: #fff;
width: auto;
user-select: none;
border: 1px solid rgba(49, 51, 63, 0.2);
}
.btn:hover{
color: #00acee;
background-color: #fff;
border: 1px solid #00acee;
}
</style>
'''
st.markdown(share_button_html, unsafe_allow_html=True)
|