PDF_to_TXT / app.py
pvyas96's picture
Create app.py
f8883b1 verified
raw
history blame
1.21 kB
import streamlit as st
import PyPDF2
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
def pdf_to_text(pdf_file):
    """Extract the text of a PDF, falling back to OCR when it has no text layer.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or a
            seekable binary file-like object holding the PDF, such as the
            object returned by ``st.file_uploader``.

    Returns:
        The concatenated text of all pages. When the embedded text layer is
        empty or whitespace-only, the pages are rasterised and the OCR
        output is returned instead.
    """
    import os

    from pdf2image import convert_from_bytes

    # PdfFileReader / numPages / getPage / extractText were removed in
    # PyPDF2 3.0; PdfReader, .pages and extract_text() are the replacements.
    reader = PyPDF2.PdfReader(pdf_file)
    # `or ''` guards against a page yielding no text object at all.
    text = ''.join(page.extract_text() or '' for page in reader.pages)

    # A scanned PDF can yield only whitespace; treat that as "no text" too.
    if text.strip():
        return text

    # Convert the PDF to images. convert_from_path only accepts a path, so an
    # in-memory upload has to go through convert_from_bytes instead.
    if isinstance(pdf_file, (str, os.PathLike)):
        images = convert_from_path(pdf_file)
    else:
        # The reader above has already consumed the stream; rewind first.
        pdf_file.seek(0)
        images = convert_from_bytes(pdf_file.read())

    # Perform OCR on each page image.
    return ''.join(pytesseract.image_to_string(image) for image in images)
def main():
    """Render the Streamlit page: upload a PDF, show its text, save a .txt copy.

    The extracted text is displayed in a code block and written, UTF-8
    encoded, to a ``.txt`` file in the current working directory named after
    the uploaded file.
    """
    from pathlib import Path

    st.title("PDF Text Extractor")
    st.write("Upload a PDF file to extract the text")
    pdf_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if pdf_file is None:
        # Nothing uploaded yet; Streamlit re-runs the script on upload.
        return

    text = pdf_to_text(pdf_file)
    st.write("Extracted Text:")
    st.code(text)

    # Save the text to a .txt file.
    # The upload name comes from the client, so keep only its final path
    # component before using it as a filesystem name.
    upload_name = Path(pdf_file.name).name or "output.pdf"
    # with_suffix swaps only the trailing extension and is case-insensitive
    # about what it replaces (".PDF" -> ".txt"), unlike str.replace.
    txt_file = str(Path(upload_name).with_suffix('.txt'))
    # Explicit UTF-8: OCR output may contain characters that the platform's
    # default encoding cannot represent.
    with open(txt_file, 'w', encoding='utf-8') as f:
        f.write(text)
    st.write(f"Text saved to {txt_file}")
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()