Spaces:

podsni
/

Coverter-PDF-to-TXT

Runtime error

App Files Files Community

Coverter-PDF-to-TXT / app.py

podsnigame

Add application file

b34881d over 2 years ago

raw

history blame contribute delete

4.07 kB

	import streamlit as st
	import pdf2image
	import pytesseract
	from pytesseract import Output, TesseractError
	from functions import convert_pdf_to_txt_pages, convert_pdf_to_txt_file, save_pages, displayPDF, images_to_txt

	st.set_page_config(page_title="PDF to Text")


	html_temp = """
	<div style="background-color:{};padding:1px">

	</div>
	"""

	# st.markdown("""
	# ## :outbox_tray: Text data extractor: PDF to Text

	# """)
	# st.markdown(html_temp.format("rgba(55, 53, 47, 0.16)"),unsafe_allow_html=True)
	st.markdown("""
	## Text data extractor: PDF to Text

	""")
	languages = {
	'English': 'eng',
	'French': 'fra',
	'Arabic': 'ara',
	'Spanish': 'spa',
	}

	with st.sidebar:
	st.title(":outbox_tray: PDF to Text")
	textOutput = st.selectbox(
	"How do you want your output text?",
	('One text file (.txt)', 'Text file per page (ZIP)'))
	ocr_box = st.checkbox('Enable OCR (scanned document)')

	st.markdown(html_temp.format("rgba(55, 53, 47, 0.16)"),unsafe_allow_html=True)
	st.markdown("""
	# How does it work?
	Simply load your PDF and convert it to single-page or multi-page text.
	""")
	st.markdown(html_temp.format("rgba(55, 53, 47, 0.16)"),unsafe_allow_html=True)
	st.markdown("""
	Made by [@nainia_ayoub](https://twitter.com/nainia_ayoub)
	""")


	pdf_file = st.file_uploader("Load your PDF", type="pdf")
	hide="""
	<style>
	footer{
	visibility: hidden;
	position: relative;
	}
	.viewerBadge_container__1QSob{
	visibility: hidden;
	}
	#MainMenu{
	visibility: hidden;
	}
	<style>
	"""
	st.markdown(hide, unsafe_allow_html=True)
	if pdf_file:
	path = pdf_file.read()
	# display document
	with st.expander("Display document"):
	displayPDF(path)
	if ocr_box:
	option = st.selectbox('Select the document language', list(languages.keys()))
	# pdf to text
	if textOutput == 'One text file (.txt)':
	if ocr_box:
	texts, nbPages = images_to_txt(path, languages[option])
	totalPages = "Pages: "+str(nbPages)+" in total"
	text_data_f = "\n\n".join(texts)
	else:
	text_data_f, nbPages = convert_pdf_to_txt_file(pdf_file)
	totalPages = "Pages: "+str(nbPages)+" in total"

	st.info(totalPages)
	st.download_button("Download txt file", text_data_f)
	else:
	if ocr_box:
	text_data, nbPages = images_to_txt(path, languages[option])
	totalPages = "Pages: "+str(nbPages)+" in total"
	else:
	text_data, nbPages = convert_pdf_to_txt_pages(pdf_file)
	totalPages = "Pages: "+str(nbPages)+" in total"
	st.info(totalPages)
	zipPath = save_pages(text_data)
	# download text data
	with open(zipPath, "rb") as fp:
	btn = st.download_button(
	label="Download ZIP (txt)",
	data=fp,
	file_name="pdf_to_txt.zip",
	mime="application/zip"
	)

	st.markdown('''
	<a target="_blank" style="color: black" href="https://twitter.com/intent/tweet?text=You%20can%20extract%20text%20from%20your%20PDF%20using%20this%20PDF%20to%20Text%20streamlit%20app%20by%20@nainia_ayoub!%0A%0Ahttps://nainiayoub-pdf-text-data-extractor-app-p6hy0z.streamlit.app/">
	<button class="btn">
	Spread the word!
	</button>
	</a>
	<style>
	.btn{
	display: inline-flex;
	-moz-box-align: center;
	align-items: center;
	-moz-box-pack: center;
	justify-content: center;
	font-weight: 400;
	padding: 0.25rem 0.75rem;
	border-radius: 0.25rem;
	margin: 0px;
	line-height: 1.6;
	color: rgb(49, 51, 63);
	background-color: #fff;
	width: auto;
	user-select: none;
	border: 1px solid rgba(49, 51, 63, 0.2);
	}
	.btn:hover{
	color: #00acee;
	background-color: #fff;
	border: 1px solid #00acee;
	}
	</style>
	''',
	unsafe_allow_html=True
	)