Spaces:
Runtime error
Runtime error
Commit
·
b34881d
1
Parent(s):
0b38fca
Add application file
Browse files- app.py +136 -0
- file_pages/entry.txt +0 -0
- functions.py +106 -0
- packages.txt +5 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pdf2image
|
3 |
+
import pytesseract
|
4 |
+
from pytesseract import Output, TesseractError
|
5 |
+
from functions import convert_pdf_to_txt_pages, convert_pdf_to_txt_file, save_pages, displayPDF, images_to_txt
|
6 |
+
|
7 |
+
# Page configuration, static layout (title, sidebar, uploader) and CSS tweaks.
# NOTE(review): indentation was lost in the extracted source; the "How does it
# work?" and "Made by" sections are assumed to belong to the sidebar -- confirm.
st.set_page_config(page_title="PDF to Text")


# Thin horizontal divider; the placeholder receives the CSS background colour.
html_temp = """
<div style="background-color:{};padding:1px">

</div>
"""

st.markdown("""
## Text data extractor: PDF to Text

""")

# Display name shown in the UI -> language code handed to the OCR helper.
languages = {
    'English': 'eng',
    'French': 'fra',
    'Arabic': 'ara',
    'Spanish': 'spa',
}

# The same divider markup is rendered twice in the sidebar; build it once.
divider_html = html_temp.format("rgba(55, 53, 47, 0.16)")

with st.sidebar:
    st.title(":outbox_tray: PDF to Text")
    textOutput = st.selectbox(
        "How do you want your output text?",
        ('One text file (.txt)', 'Text file per page (ZIP)'))
    ocr_box = st.checkbox('Enable OCR (scanned document)')

    st.markdown(divider_html, unsafe_allow_html=True)
    st.markdown("""
# How does it work?
Simply load your PDF and convert it to single-page or multi-page text.
""")
    st.markdown(divider_html, unsafe_allow_html=True)
    st.markdown("""
Made by [@nainia_ayoub](https://twitter.com/nainia_ayoub)
""")


pdf_file = st.file_uploader("Load your PDF", type="pdf")

# CSS that hides the footer, the viewer badge and the main menu.
# Fix: the block previously ended with a second opening <style> tag, which left
# the element unclosed; it is now terminated with </style>.
hide = """
<style>
footer{
visibility: hidden;
position: relative;
}
.viewerBadge_container__1QSob{
visibility: hidden;
}
#MainMenu{
visibility: hidden;
}
</style>
"""
st.markdown(hide, unsafe_allow_html=True)
|
66 |
+
# Main flow: once a PDF is uploaded, preview it, extract its text (OCR or
# embedded text) and offer the result as a single .txt or a ZIP of pages.
# NOTE(review): indentation was lost in the extracted source; the language
# selectbox is assumed to sit outside the expander -- confirm.
if pdf_file:
    path = pdf_file.read()

    # Preview of the uploaded document.
    with st.expander("Display document"):
        displayPDF(path)

    # The language only matters when OCR is requested.
    if ocr_box:
        option = st.selectbox('Select the document language', list(languages.keys()))

    if textOutput == 'One text file (.txt)':
        # Everything goes into one text blob.
        if ocr_box:
            texts, nbPages = images_to_txt(path, languages[option])
            text_data_f = "\n\n".join(texts)
        else:
            text_data_f, nbPages = convert_pdf_to_txt_file(pdf_file)

        totalPages = f"Pages: {nbPages} in total"
        st.info(totalPages)
        st.download_button("Download txt file", text_data_f)
    else:
        # One text file per page, bundled into a ZIP archive.
        if ocr_box:
            text_data, nbPages = images_to_txt(path, languages[option])
        else:
            text_data, nbPages = convert_pdf_to_txt_pages(pdf_file)

        totalPages = f"Pages: {nbPages} in total"
        st.info(totalPages)
        zipPath = save_pages(text_data)

        # Hand the archive to the download button while the file is open.
        with open(zipPath, "rb") as fp:
            st.download_button(
                label="Download ZIP (txt)",
                data=fp,
                file_name="pdf_to_txt.zip",
                mime="application/zip",
            )
|
102 |
+
|
103 |
+
# "Spread the word" link: an anchor wrapping a button, plus the CSS that
# styles the button and its hover state.
share_button_html = '''
<a target="_blank" style="color: black" href="https://twitter.com/intent/tweet?text=You%20can%20extract%20text%20from%20your%20PDF%20using%20this%20PDF%20to%20Text%20streamlit%20app%20by%20@nainia_ayoub!%0A%0Ahttps://nainiayoub-pdf-text-data-extractor-app-p6hy0z.streamlit.app/">
<button class="btn">
Spread the word!
</button>
</a>
<style>
.btn{
display: inline-flex;
-moz-box-align: center;
align-items: center;
-moz-box-pack: center;
justify-content: center;
font-weight: 400;
padding: 0.25rem 0.75rem;
border-radius: 0.25rem;
margin: 0px;
line-height: 1.6;
color: rgb(49, 51, 63);
background-color: #fff;
width: auto;
user-select: none;
border: 1px solid rgba(49, 51, 63, 0.2);
}
.btn:hover{
color: #00acee;
background-color: #fff;
border: 1px solid #00acee;
}
</style>
'''
st.markdown(share_button_html, unsafe_allow_html=True)
|
136 |
+
|
file_pages/entry.txt
ADDED
File without changes
|
functions.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from zipfile import ZipFile
|
3 |
+
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
4 |
+
from pdfminer.converter import TextConverter
|
5 |
+
from pdfminer.layout import LAParams
|
6 |
+
from pdfminer.pdfpage import PDFPage
|
7 |
+
from io import StringIO
|
8 |
+
import base64
|
9 |
+
#------- OCR ------------
|
10 |
+
import pdf2image
|
11 |
+
import pytesseract
|
12 |
+
from pytesseract import Output, TesseractError
|
13 |
+
|
14 |
+
@st.cache
def images_to_txt(path, language):
    """Run OCR over every page of a PDF and collect the recognised text.

    Args:
        path: PDF data in the form accepted by
            ``pdf2image.convert_from_bytes``.
        language: Language code forwarded to ``pytesseract`` as ``lang``.

    Returns:
        A ``(texts, count)`` tuple where ``texts`` holds one string per
        rendered page image and ``count`` is ``len(texts)``.
    """
    # Render all pages to images first, then OCR them in order.
    page_images = pdf2image.convert_from_bytes(path)
    page_texts = [
        pytesseract.image_to_string(page_image, lang=language)
        for page_image in page_images
    ]
    return page_texts, len(page_texts)
|
27 |
+
|
28 |
+
@st.cache
def convert_pdf_to_txt_pages(path):
    """Extract the embedded text of a PDF, one string per page.

    Args:
        path: Object handed directly to ``PDFPage.get_pages`` (presumably a
            binary file-like object such as the uploaded file; verify against
            the caller).

    Returns:
        A ``(texts, nbPages)`` tuple: the list of per-page strings and the
        number of pages processed.
    """
    texts = []
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Single pass over the document: the page count is simply the number
        # of texts collected, so the PDF does not need to be parsed twice.
        for page in PDFPage.get_pages(path):
            interpreter.process_page(page)
            texts.append(retstr.getvalue())
            # Reset the buffer so the next read only contains the next page,
            # instead of re-copying everything accumulated so far.
            retstr.seek(0)
            retstr.truncate(0)
    finally:
        # Release the converter and the buffer even if a page fails to parse.
        device.close()
        retstr.close()
    return texts, len(texts)
|
56 |
+
|
57 |
+
@st.cache
def convert_pdf_to_txt_file(path):
    """Extract the embedded text of a whole PDF as a single string.

    Args:
        path: Object handed directly to ``PDFPage.get_pages`` (presumably a
            binary file-like object such as the uploaded file; verify against
            the caller).

    Returns:
        A ``(text, nbPages)`` tuple: the concatenated text of all pages and
        the number of pages processed. A document without pages yields
        ``("", 0)`` instead of raising ``UnboundLocalError``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        nbPages = 0
        # Count pages while processing them so the PDF is parsed only once.
        for page in PDFPage.get_pages(path):
            interpreter.process_page(page)
            nbPages += 1
        # Read the buffer once, after every page has been written to it.
        text = retstr.getvalue()
    finally:
        # Release the converter and the buffer even if a page fails to parse.
        device.close()
        retstr.close()
    return text, nbPages
|
78 |
+
|
79 |
+
@st.cache
def save_pages(pages):
    """Write each page to its own text file and bundle the files in a ZIP.

    Args:
        pages: Iterable of strings, one per page. Page ``i`` is written to
            ``./file_pages/page_<i>.txt`` (UTF-8).

    Returns:
        The path of the created archive, ``./file_pages/pdf_to_txt.zip``.
    """
    import os

    # Make sure the output directory is there before writing into it.
    os.makedirs("./file_pages", exist_ok=True)

    files = []
    for index, content in enumerate(pages):
        filename = "./file_pages/" + "page_" + str(index) + ".txt"
        with open(filename, 'w', encoding="utf-8") as file:
            file.write(content)
        files.append(filename)

    # The context manager finalises the archive even if adding a file fails.
    zipPath = './file_pages/pdf_to_txt.zip'
    with ZipFile(zipPath, 'w') as zipObj:
        for f in files:
            zipObj.write(f)

    return zipPath
|
97 |
+
|
98 |
+
def displayPDF(file):
    """Embed a PDF in the page as a base64 ``data:`` URI inside an iframe.

    Args:
        file: Bytes-like PDF content (it is passed to ``base64.b64encode``).
    """
    encoded_pdf = base64.b64encode(file).decode('utf-8')

    # Inline the document so no separate file needs to be served.
    iframe_markup = f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="700" height="1000" type="application/pdf"></iframe>'

    st.markdown(iframe_markup, unsafe_allow_html=True)
|
packages.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
poppler-utils
|
2 |
+
tesseract-ocr
|
3 |
+
tesseract-ocr-spa
|
4 |
+
tesseract-ocr-fra
|
5 |
+
tesseract-ocr-ara
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
pdfminer==20191125
|
3 |
+
pdf2image
|
4 |
+
pytesseract
|