"""Streamlit front end for the "PDF to Text" extractor.

Configures the page, renders the title and the sidebar options (output
format and OCR toggle) and shows the PDF upload widget.
"""

import streamlit as st
import pdf2image
import pytesseract
from pytesseract import Output, TesseractError

from functions import (
    convert_pdf_to_txt_pages,
    convert_pdf_to_txt_file,
    save_pages,
    displayPDF,
    images_to_txt,
)

# Must stay the first Streamlit command executed by the script.
st.set_page_config(page_title="PDF to Text")

# Colour handed to html_temp.format() wherever a separator is rendered.
DIVIDER_COLOR = "rgba(55, 53, 47, 0.16)"

# Template rendered through st.markdown(..., unsafe_allow_html=True).
# NOTE(review): the literal currently holds only a newline and has no "{}"
# placeholder, so html_temp.format(...) returns it unchanged. The markup it
# once contained appears to have been stripped from this file -- restore it
# from version control.
html_temp = """
"""

# st.markdown("""
#     ## :outbox_tray: Text data extractor: PDF to Text
# """)
# st.markdown(html_temp.format("rgba(55, 53, 47, 0.16)"),unsafe_allow_html=True)
st.markdown(""" ## Text data extractor: PDF to Text """)

# Display name -> language code.
# NOTE(review): presumably the codes are passed to Tesseract; the code that
# consumes this mapping is not visible here -- confirm.
languages = {
    'English': 'eng',
    'French': 'fra',
    'Arabic': 'ara',
    'Spanish': 'spa',
}

with st.sidebar:
    st.title(":outbox_tray: PDF to Text")
    textOutput = st.selectbox(
        "How do you want your output text?",
        ('One text file (.txt)', 'Text file per page (ZIP)'),
    )
    ocr_box = st.checkbox('Enable OCR (scanned document)')

    st.markdown(html_temp.format(DIVIDER_COLOR), unsafe_allow_html=True)
    # The heading needs its own line; fused with the sentence below it, the
    # whole paragraph would render as a level-1 heading.
    st.markdown(
        "# How does it work?\n"
        "Simply load your PDF and convert it to single-page or multi-page text."
    )
    st.markdown(html_temp.format(DIVIDER_COLOR), unsafe_allow_html=True)
    st.markdown(""" Made by [@nainia_ayoub](https://twitter.com/nainia_ayoub) """)

# NOTE(review): assumed to belong to the main page rather than the sidebar;
# the original indentation was lost -- confirm.
pdf_file = st.file_uploader("Load your PDF", type="pdf")

# NOTE(review): the file is truncated from this point. The remaining text was:
#
#     hide=""" ''', unsafe_allow_html=True )
#
# The body of the `hide` literal and everything up to the tail of a later
# ('''...''', unsafe_allow_html=True) call are missing, and the fragment is
# not valid Python. It is kept here as a comment so the module parses;
# restore the lost section from version control.