podsnigame committed on
Commit
b34881d
·
1 Parent(s): 0b38fca

Add application file

Browse files
Files changed (5) hide show
  1. app.py +136 -0
  2. file_pages/entry.txt +0 -0
  3. functions.py +106 -0
  4. packages.txt +5 -0
  5. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pdf2image
3
+ import pytesseract
4
+ from pytesseract import Output, TesseractError
5
+ from functions import convert_pdf_to_txt_pages, convert_pdf_to_txt_file, save_pages, displayPDF, images_to_txt
6
+
7
# --- Page setup and static layout -------------------------------------------
st.set_page_config(page_title="PDF to Text")

# Thin coloured <div> used as a horizontal divider; the placeholder receives
# the CSS background colour.
html_temp = """
<div style="background-color:{};padding:1px">

</div>
"""

st.markdown("""
## Text data extractor: PDF to Text

""")

# Display name -> language code handed to the OCR helper when OCR is enabled.
languages = dict(English='eng', French='fra', Arabic='ara', Spanish='spa')

with st.sidebar:
    st.title(":outbox_tray: PDF to Text")

    # Output format and OCR switch; both are read further down the script.
    output_choices = ('One text file (.txt)', 'Text file per page (ZIP)')
    textOutput = st.selectbox("How do you want your output text?", output_choices)
    ocr_box = st.checkbox('Enable OCR (scanned document)')

    # The same divider is drawn above and below the help text.
    divider = html_temp.format("rgba(55, 53, 47, 0.16)")
    st.markdown(divider, unsafe_allow_html=True)
    st.markdown(
        "\n# How does it work?\n"
        "Simply load your PDF and convert it to single-page or multi-page text.\n"
    )
    st.markdown(divider, unsafe_allow_html=True)
    st.markdown("\nMade by [@nainia_ayoub](https://twitter.com/nainia_ayoub)\n")
48
+
49
+
50
# Upload widget; `pdf_file` is falsy until the user has picked a PDF.
pdf_file = st.file_uploader("Load your PDF", type="pdf")

# CSS that hides Streamlit's footer, viewer badge and main menu.
# The block must end with a closing </style> tag: the previous version opened
# a second <style> instead, leaving the element unterminated.
hide="""
<style>
footer{
visibility: hidden;
position: relative;
}
.viewerBadge_container__1QSob{
visibility: hidden;
}
#MainMenu{
visibility: hidden;
}
</style>
"""
st.markdown(hide, unsafe_allow_html=True)
66
# --- Conversion flow: runs once a PDF has been uploaded ------------------------
if pdf_file:
    # Raw bytes of the upload: used for the inline preview and for the OCR
    # route. The non-OCR helpers receive the uploaded file object itself.
    pdf_bytes = pdf_file.read()

    # display document
    with st.expander("Display document"):
        displayPDF(pdf_bytes)

    # The language picker only exists when OCR is enabled, so the code is
    # resolved here and only used inside the `ocr_box` branches below.
    ocr_language = None
    if ocr_box:
        option = st.selectbox('Select the document language', list(languages.keys()))
        ocr_language = languages[option]

    # pdf to text
    if textOutput == 'One text file (.txt)':
        if ocr_box:
            texts, nbPages = images_to_txt(pdf_bytes, ocr_language)
            text_data_f = "\n\n".join(texts)
        else:
            text_data_f, nbPages = convert_pdf_to_txt_file(pdf_file)

        st.info("Pages: "+str(nbPages)+" in total")
        # Give the download an explicit name (mirroring the ZIP branch);
        # without it Streamlit generates one automatically.
        st.download_button(
            "Download txt file",
            text_data_f,
            file_name="pdf_to_txt.txt",
        )
    else:
        if ocr_box:
            text_data, nbPages = images_to_txt(pdf_bytes, ocr_language)
        else:
            text_data, nbPages = convert_pdf_to_txt_pages(pdf_file)

        st.info("Pages: "+str(nbPages)+" in total")
        zipPath = save_pages(text_data)
        # download text data
        with open(zipPath, "rb") as fp:
            st.download_button(
                label="Download ZIP (txt)",
                data=fp,
                file_name="pdf_to_txt.zip",
                mime="application/zip"
            )
102
+
103
# --- "Spread the word" footer -------------------------------------------------
# Link that opens a pre-filled tweet, wrapped around a plain HTML button.
share_button_html = '''
<a target="_blank" style="color: black" href="https://twitter.com/intent/tweet?text=You%20can%20extract%20text%20from%20your%20PDF%20using%20this%20PDF%20to%20Text%20streamlit%20app%20by%20@nainia_ayoub!%0A%0Ahttps://nainiayoub-pdf-text-data-extractor-app-p6hy0z.streamlit.app/">
<button class="btn">
Spread the word!
</button>
</a>
'''

# Styling for the button above, including its hover state.
share_button_css = '''<style>
.btn{
display: inline-flex;
-moz-box-align: center;
align-items: center;
-moz-box-pack: center;
justify-content: center;
font-weight: 400;
padding: 0.25rem 0.75rem;
border-radius: 0.25rem;
margin: 0px;
line-height: 1.6;
color: rgb(49, 51, 63);
background-color: #fff;
width: auto;
user-select: none;
border: 1px solid rgba(49, 51, 63, 0.2);
}
.btn:hover{
color: #00acee;
background-color: #fff;
border: 1px solid #00acee;
}
</style>
'''

st.markdown(share_button_html + share_button_css, unsafe_allow_html=True)
136
+
file_pages/entry.txt ADDED
File without changes
functions.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from zipfile import ZipFile
3
+ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
4
+ from pdfminer.converter import TextConverter
5
+ from pdfminer.layout import LAParams
6
+ from pdfminer.pdfpage import PDFPage
7
+ from io import StringIO
8
+ import base64
9
+ #------- OCR ------------
10
+ import pdf2image
11
+ import pytesseract
12
+ from pytesseract import Output, TesseractError
13
+
14
@st.cache
def images_to_txt(path, language):
    """Run OCR over a PDF and return the recognised text.

    Args:
        path: Raw PDF content; despite the name it is handed to
            ``pdf2image.convert_from_bytes``, not opened as a file path.
        language: Language code forwarded to pytesseract as ``lang``.

    Returns:
        A ``(texts, count)`` tuple: one recognised string per image produced
        by pdf2image, and the number of those strings.
    """
    page_images = pdf2image.convert_from_bytes(path)
    page_texts = [
        pytesseract.image_to_string(page_image, lang=language)
        for page_image in page_images
    ]
    return page_texts, len(page_texts)
27
+
28
@st.cache
def convert_pdf_to_txt_pages(path):
    """Extract the text of a PDF page by page with pdfminer.

    Args:
        path: The PDF source passed straight to ``PDFPage.get_pages``
            (the app hands in the uploaded file object, not a filesystem
            path).

    Returns:
        A ``(texts, nbPages)`` tuple: the list of per-page text strings and
        the number of pages processed.
    """
    texts = []
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # Single pass over the document: the page count falls out of the
        # number of collected texts, so the PDF is not parsed a second time
        # just to count pages.
        for page in PDFPage.get_pages(path):
            interpreter.process_page(page)
            texts.append(retstr.getvalue())
            # Empty the shared buffer so the next page starts from scratch
            # instead of slicing an ever-growing string.
            retstr.seek(0)
            retstr.truncate(0)
    finally:
        # Release the converter and buffer even if a page fails to parse.
        device.close()
        retstr.close()

    return texts, len(texts)
56
+
57
@st.cache
def convert_pdf_to_txt_file(path):
    """Extract the whole text of a PDF as a single string with pdfminer.

    Args:
        path: The PDF source passed straight to ``PDFPage.get_pages``
            (the app hands in the uploaded file object, not a filesystem
            path).

    Returns:
        A ``(text, nbPages)`` tuple: the concatenated text of every page and
        the number of pages processed. A document that yields no pages
        returns ``("", 0)``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    nbPages = 0
    try:
        # Count pages while processing them, so the PDF is parsed only once.
        for page in PDFPage.get_pages(path):
            interpreter.process_page(page)
            nbPages += 1
        # Read the buffer once, after the loop: this avoids copying the whole
        # text on every page and keeps the result defined when the loop body
        # never runs.
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even if a page fails to parse.
        device.close()
        retstr.close()

    return text, nbPages
78
+
79
def save_pages(pages):
    """Write each page to its own text file and bundle the files into a ZIP.

    This function is intentionally NOT cached. Its real result is the archive
    written to a fixed path on disk, so a cache hit would return that path
    without rewriting it, and the archive could still contain the pages of
    whichever document was converted last.

    Args:
        pages: Sequence of strings, one per page.

    Returns:
        The path of the ZIP archive (``'./file_pages/pdf_to_txt.zip'``). It
        contains ``file_pages/page_<index>.txt`` for every entry of *pages*,
        with indexes starting at 0.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import os

    out_dir = "./file_pages/"
    # The output directory may be missing on a fresh checkout or deployment.
    os.makedirs(out_dir, exist_ok=True)

    files = []
    for index, text in enumerate(pages):
        filename = out_dir + "page_" + str(index) + ".txt"
        with open(filename, 'w', encoding="utf-8") as file:
            file.write(text)
        files.append(filename)

    # Only the files written above go into the archive, so leftovers from a
    # longer, earlier document are never included.
    zipPath = './file_pages/pdf_to_txt.zip'
    with ZipFile(zipPath, 'w') as zipObj:
        for f in files:
            zipObj.write(f)

    return zipPath
97
+
98
def displayPDF(file):
    """Show a PDF inline in the Streamlit page.

    Args:
        file: Raw PDF bytes (not a path). They are base64-encoded and
            embedded in a ``data:`` URI inside an ``<iframe>``.
    """
    encoded_pdf = base64.b64encode(file).decode('utf-8')

    # Build the iframe markup around the encoded document.
    iframe_html = (
        '<iframe src="data:application/pdf;base64,' + encoded_pdf + '" '
        'width="700" height="1000" type="application/pdf"></iframe>'
    )

    # Raw HTML has to be explicitly allowed for Streamlit to render it.
    st.markdown(iframe_html, unsafe_allow_html=True)
packages.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ poppler-utils
2
+ tesseract-ocr
3
+ tesseract-ocr-spa
4
+ tesseract-ocr-fra
5
+ tesseract-ocr-ara
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ pdfminer==20191125
3
+ pdf2image
4
+ pytesseract