PDF_to_TXT / app.py
pvyas96's picture
Update app.py
f377843 verified
import streamlit as st
import PyPDF2
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
def pdf_to_text(pdf_file):
    """Extract the text of a PDF, falling back to OCR when no text is found.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or a
            binary file-like object (for example the object returned by
            ``st.file_uploader``).

    Returns:
        The concatenated text of every page. When the embedded text layer is
        empty or whitespace-only, each page is rendered to an image and the
        concatenated Tesseract OCR output is returned instead.
    """
    import os  # local import: only needed to recognise path-like arguments

    pdf = PyPDF2.PdfReader(pdf_file)

    # Defensive `or ''`: a page that yields a falsy result (None or '') must
    # not break the concatenation.
    text = ''.join(page.extract_text() or '' for page in pdf.pages)

    # Whitespace-only output carries no content, so treat it like "no text"
    # and let the OCR fallback run.
    if text.strip():
        return text

    if isinstance(pdf_file, (str, os.PathLike)):
        images = convert_from_path(pdf_file)
    else:
        # convert_from_path() needs a real path; an uploaded/in-memory file
        # has none, so hand the raw bytes to pdf2image instead.
        from pdf2image import convert_from_bytes

        if hasattr(pdf_file, 'getvalue'):
            pdf_bytes = pdf_file.getvalue()
        else:
            # PdfReader has already consumed the stream; rewind before reading.
            pdf_file.seek(0)
            pdf_bytes = pdf_file.read()
        images = convert_from_bytes(pdf_bytes)

    return ''.join(pytesseract.image_to_string(image) for image in images)
def main():
    """Render the Streamlit page: upload a PDF, extract its text, offer a download.

    The extracted text is handed to the download button directly as UTF-8
    bytes; nothing is written to the server's filesystem.
    """
    from pathlib import PurePath  # local import: used only to build the file name

    st.title("PDF Text Extractor")
    st.write("Upload a PDF file to extract the text")

    pdf_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if pdf_file is None:
        return

    text = pdf_to_text(pdf_file)

    # Swap only the final extension (whatever its case) for ".txt", and keep
    # just the base name so the suggested download name has no directory part.
    txt_name = PurePath(pdf_file.name).with_suffix('.txt').name

    st.download_button(
        label="Download Extracted Text",
        data=text.encode("utf-8"),
        file_name=txt_name,
        mime="text/plain",
    )
# Script entry point: build the page only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()