Spaces:

VyasAI
/

PDF_to_TXT

Sleeping

PDF_to_TXT / app.py

Update app.py

f377843 verified 10 months ago

1.25 kB

	import streamlit as st
	import PyPDF2
	import pytesseract
	from PIL import Image
	from pdf2image import convert_from_path

	def pdf_to_text(pdf_file):
	    """Extract the text of a PDF, falling back to OCR when no text is found.

	    Args:
	        pdf_file: Either a filesystem path to the PDF or a seekable binary
	            file-like object holding it (``main`` passes the object returned
	            by ``st.file_uploader``).

	    Returns:
	        The text of every page concatenated in page order. When the embedded
	        text is empty or whitespace-only, the pages are rendered to images
	        and the OCR output of each image is concatenated instead.
	    """
	    reader = PyPDF2.PdfReader(pdf_file)

	    # Treat a missing per-page result as empty so the join can never fail.
	    text = "".join(page.extract_text() or "" for page in reader.pages)

	    # Whitespace-only output carries no content, so it also triggers OCR.
	    if text.strip():
	        return text

	    if hasattr(pdf_file, "read"):
	        # convert_from_path() needs a real path; an in-memory upload has to
	        # go through convert_from_bytes(). Imported here because it is only
	        # needed on this fallback path.
	        from pdf2image import convert_from_bytes

	        # PdfReader has already consumed the stream, so rewind before reading.
	        pdf_file.seek(0)
	        images = convert_from_bytes(pdf_file.read())
	    else:
	        images = convert_from_path(pdf_file)

	    return "".join(pytesseract.image_to_string(image) for image in images)

	def main():
	    """Render the Streamlit page: upload a PDF and offer its text as a download.

	    Shows the title and the uploader; once a file is uploaded, extracts its
	    text with ``pdf_to_text`` and exposes it through a download button. The
	    text is handed to the button in memory, so nothing is written to disk.
	    """
	    from pathlib import Path

	    st.title("PDF Text Extractor")
	    st.write("Upload a PDF file to extract the text")

	    pdf_file = st.file_uploader("Upload PDF file", type=["pdf"])
	    if pdf_file is None:
	        return

	    text = pdf_to_text(pdf_file)

	    # Build the download name from the basename's stem: this handles any
	    # extension casing (".PDF"), leaves a ".pdf" in the middle of the name
	    # alone, and drops directory components from the client-supplied name.
	    txt_name = Path(pdf_file.name).stem + ".txt"

	    # Encode explicitly so the download is UTF-8 regardless of the platform's
	    # default encoding.
	    st.download_button(
	        label="Download Extracted Text",
	        data=text.encode("utf-8"),
	        file_name=txt_name,
	        mime="text/plain",
	    )

	# Script entry point: build the page only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()