import streamlit as st
import pdf2image
import pytesseract
from pytesseract import Output, TesseractError
from functions import convert_pdf_to_txt_pages, convert_pdf_to_txt_file, save_pages, displayPDF, images_to_txt
# First Streamlit call visible in this script: sets the page title.
st.set_page_config(page_title="PDF to Text")

# Template that is run through str.format() wherever a separator is rendered.
# NOTE(review): the template is a lone newline with no "{}" placeholder, so
# the colour string handed to format() is discarded -- confirm whether the
# intended markup was lost.
html_temp = "\n"

# Heading shown in the main area of the page.
st.markdown("\n## Text data extractor: PDF to Text\n")
# Maps a human-readable language name to its three-letter code.
# NOTE(review): the codes look like Tesseract language identifiers --
# confirm against where this mapping is consumed.
languages = dict(
    English='eng',
    French='fra',
    Arabic='ara',
    Spanish='spa',
)
# Sidebar: output options, a short usage note and the author credit.
# NOTE(review): the original indentation was flattened; the with-body is
# assumed to run through the author credit line -- confirm.
with st.sidebar:
    st.title(":outbox_tray: PDF to Text")

    # How the extracted text should be packaged.
    _output_choices = ('One text file (.txt)', 'Text file per page (ZIP)')
    textOutput = st.selectbox("How do you want your output text?", _output_choices)

    # Checkbox whose label targets scanned documents.
    ocr_box = st.checkbox('Enable OCR (scanned document)')

    # The same formatted template is rendered twice below, so build it once.
    _separator = html_temp.format("rgba(55, 53, 47, 0.16)")

    st.markdown(_separator, unsafe_allow_html=True)
    st.markdown(
        "\n# How does it work?\n"
        "Simply load your PDF and convert it to single-page or multi-page text.\n"
    )
    st.markdown(_separator, unsafe_allow_html=True)
    st.markdown("\nMade by [@nainia_ayoub](https://twitter.com/nainia_ayoub)\n")
# Upload widget for the input document; the accepted type is restricted to "pdf".
pdf_file = st.file_uploader(label="Load your PDF", type="pdf")
hide="""
''',
unsafe_allow_html=True
)