Spaces:

podsni
/

Coverter-PDF-to-TXT

Runtime error

App Files Files Community

podsnigame committed on Mar 2, 2023

Commit

b34881d

1 Parent(s): 0b38fca

Add application file

Browse files

Files changed (5) hide show

app.py +136 -0
file_pages/entry.txt +0 -0
functions.py +106 -0
packages.txt +5 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import streamlit as st
+import pdf2image
+import pytesseract
+from pytesseract import Output, TesseractError
+from functions import convert_pdf_to_txt_pages, convert_pdf_to_txt_file, save_pages, displayPDF, images_to_txt
+st.set_page_config(page_title="PDF to Text")
+html_temp = """
+            <div style="background-color:{};padding:1px">
+            </div>
+            """
+# st.markdown("""
+#     ## :outbox_tray: Text data extractor: PDF to Text
+# """)
+# st.markdown(html_temp.format("rgba(55, 53, 47, 0.16)"),unsafe_allow_html=True)
+st.markdown("""
+    ## Text data extractor: PDF to Text
+""")
+languages = {
+    'English': 'eng',
+    'French': 'fra',
+    'Arabic': 'ara',
+    'Spanish': 'spa',
+}
+with st.sidebar:
+    st.title(":outbox_tray: PDF to Text")
+    textOutput = st.selectbox(
+        "How do you want your output text?",
+        ('One text file (.txt)', 'Text file per page (ZIP)'))
+    ocr_box = st.checkbox('Enable OCR (scanned document)')
+    st.markdown(html_temp.format("rgba(55, 53, 47, 0.16)"),unsafe_allow_html=True)
+    st.markdown("""
+    # How does it work?
+    Simply load your PDF and convert it to single-page or multi-page text.
+    """)
+    st.markdown(html_temp.format("rgba(55, 53, 47, 0.16)"),unsafe_allow_html=True)
+    st.markdown("""
+    Made by [@nainia_ayoub](https://twitter.com/nainia_ayoub)
+    """)
+pdf_file = st.file_uploader("Load your PDF", type="pdf")
+hide="""
+<style>
+footer{
+	visibility: hidden;
+    	position: relative;
+}
+.viewerBadge_container__1QSob{
+  	visibility: hidden;
+}
+#MainMenu{
+	visibility: hidden;
+}
+<style>
+"""
+st.markdown(hide, unsafe_allow_html=True)
+if pdf_file:
+    path = pdf_file.read()
+    # display document
+    with st.expander("Display document"):
+        displayPDF(path)
+    if ocr_box:
+        option = st.selectbox('Select the document language', list(languages.keys()))
+    # pdf to text
+    if textOutput == 'One text file (.txt)':
+        if ocr_box:
+            texts, nbPages = images_to_txt(path, languages[option])
+            totalPages = "Pages: "+str(nbPages)+" in total"
+            text_data_f = "\n\n".join(texts)
+        else:
+            text_data_f, nbPages = convert_pdf_to_txt_file(pdf_file)
+            totalPages = "Pages: "+str(nbPages)+" in total"
+        st.info(totalPages)
+        st.download_button("Download txt file", text_data_f)
+    else:
+        if ocr_box:
+            text_data, nbPages = images_to_txt(path, languages[option])
+            totalPages = "Pages: "+str(nbPages)+" in total"
+        else:
+            text_data, nbPages = convert_pdf_to_txt_pages(pdf_file)
+            totalPages = "Pages: "+str(nbPages)+" in total"
+        st.info(totalPages)
+        zipPath = save_pages(text_data)
+        # download text data
+        with open(zipPath, "rb") as fp:
+            btn = st.download_button(
+                label="Download ZIP (txt)",
+                data=fp,
+                file_name="pdf_to_txt.zip",
+                mime="application/zip"
+            )
+    st.markdown('''
+    <a target="_blank" style="color: black" href="https://twitter.com/intent/tweet?text=You%20can%20extract%20text%20from%20your%20PDF%20using%20this%20PDF%20to%20Text%20streamlit%20app%20by%20@nainia_ayoub!%0A%0Ahttps://nainiayoub-pdf-text-data-extractor-app-p6hy0z.streamlit.app/">
+        <button class="btn">
+            Spread the word!
+        </button>
+    </a>
+    <style>
+    .btn{
+        display: inline-flex;
+        -moz-box-align: center;
+        align-items: center;
+        -moz-box-pack: center;
+        justify-content: center;
+        font-weight: 400;
+        padding: 0.25rem 0.75rem;
+        border-radius: 0.25rem;
+        margin: 0px;
+        line-height: 1.6;
+        color: rgb(49, 51, 63);
+        background-color: #fff;
+        width: auto;
+        user-select: none;
+        border: 1px solid rgba(49, 51, 63, 0.2);
+        }
+    .btn:hover{
+        color: #00acee;
+        background-color: #fff;
+        border: 1px solid #00acee;
+    }
+    </style>
+    ''',
+    unsafe_allow_html=True
+    )

file_pages/entry.txt ADDED Viewed

File without changes

functions.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import streamlit as st
+from zipfile import ZipFile
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.converter import TextConverter
+from pdfminer.layout import LAParams
+from pdfminer.pdfpage import PDFPage
+from io import StringIO
+import base64
+#------- OCR ------------
+import pdf2image
+import pytesseract
+from pytesseract import Output, TesseractError
+@st.cache
+def images_to_txt(path, language):
+    images = pdf2image.convert_from_bytes(path)
+    all_text = []
+    for i in images:
+        pil_im = i
+        text = pytesseract.image_to_string(pil_im, lang=language)
+        # ocr_dict = pytesseract.image_to_data(pil_im, lang='eng', output_type=Output.DICT)
+        # ocr_dict now holds all the OCR info including text and location on the image
+        # text = " ".join(ocr_dict['text'])
+        # text = re.sub('[ ]{2,}', '\n', text)
+        all_text.append(text)
+    return all_text, len(all_text)
+@st.cache
+def convert_pdf_to_txt_pages(path):
+    texts = []
+    rsrcmgr = PDFResourceManager()
+    retstr = StringIO()
+    laparams = LAParams()
+    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
+    # fp = open(path, 'rb')
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    size = 0
+    c = 0
+    file_pages = PDFPage.get_pages(path)
+    nbPages = len(list(file_pages))
+    for page in PDFPage.get_pages(path):
+      interpreter.process_page(page)
+      t = retstr.getvalue()
+      if c == 0:
+        texts.append(t)
+      else:
+        texts.append(t[size:])
+      c = c+1
+      size = len(t)
+    # text = retstr.getvalue()
+    # fp.close()
+    device.close()
+    retstr.close()
+    return texts, nbPages
+@st.cache
+def convert_pdf_to_txt_file(path):
+    texts = []
+    rsrcmgr = PDFResourceManager()
+    retstr = StringIO()
+    laparams = LAParams()
+    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
+    # fp = open(path, 'rb')
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    file_pages = PDFPage.get_pages(path)
+    nbPages = len(list(file_pages))
+    for page in PDFPage.get_pages(path):
+      interpreter.process_page(page)
+      t = retstr.getvalue()
+    # text = retstr.getvalue()
+    # fp.close()
+    device.close()
+    retstr.close()
+    return t, nbPages
+@st.cache
+def save_pages(pages):
+  files = []
+  for page in range(len(pages)):
+    filename = "page_"+str(page)+".txt"
+    with open("./file_pages/"+filename, 'w', encoding="utf-8") as file:
+      file.write(pages[page])
+      files.append(file.name)
+  # create zipfile object
+  zipPath = './file_pages/pdf_to_txt.zip'
+  zipObj = ZipFile(zipPath, 'w')
+  for f in files:
+    zipObj.write(f)
+  zipObj.close()
+  return zipPath
+def displayPDF(file):
+  # Opening file from file path
+  # with open(file, "rb") as f:
+  base64_pdf = base64.b64encode(file).decode('utf-8')
+  # Embedding PDF in HTML
+  pdf_display = F'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'
+  # Displaying File
+  st.markdown(pdf_display, unsafe_allow_html=True)

packages.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+poppler-utils
+tesseract-ocr
+tesseract-ocr-spa
+tesseract-ocr-fra
+tesseract-ocr-ara

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+streamlit
+pdfminer==20191125
+pdf2image
+pytesseract