"""Streamlit app that extracts the text of an uploaded PDF.

The embedded text layer is read with PyPDF2; when a document has no usable
text layer, its pages are rasterized and run through Tesseract OCR instead.
The extracted text is shown in the page and saved to a ``.txt`` file in the
current working directory.
"""

import os
from pathlib import Path

import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_bytes, convert_from_path
from PIL import Image  # noqa: F401 -- not referenced below; kept from the original imports


def _render_pages(pdf_file):
    """Rasterize every page of *pdf_file* into images suitable for OCR.

    Args:
        pdf_file: A filesystem path (``str`` / ``os.PathLike``) or a binary
            file-like object holding the PDF data.

    Returns:
        The list of page images produced by pdf2image.
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        return convert_from_path(pdf_file)

    # A stream is not a path, so hand pdf2image the raw bytes instead.
    if hasattr(pdf_file, "getvalue"):
        data = pdf_file.getvalue()
    else:
        # PdfReader has already consumed the stream; rewind before reading.
        pdf_file.seek(0)
        data = pdf_file.read()
    return convert_from_bytes(data)


def _txt_name_for(pdf_name):
    """Return the ``.txt`` file name that corresponds to *pdf_name*.

    Only the final path component is used, and only the last extension is
    replaced (regardless of its case), so ``"Scan.PDF"`` -> ``"Scan.txt"``.
    """
    stem = Path(pdf_name).stem or "extracted"
    return f"{stem}.txt"


def pdf_to_text(pdf_file):
    """Extract the text of a PDF, falling back to OCR when none is embedded.

    Args:
        pdf_file: A filesystem path or a binary file-like object (such as the
            object returned by ``st.file_uploader``) containing the PDF.

    Returns:
        The concatenated text of all pages. If the embedded text layer is
        empty or whitespace-only, the OCR result is returned instead.
    """
    pdf = PyPDF2.PdfReader(pdf_file)

    # extract_text() may yield None for a page without text; treat as empty.
    text = "".join(page.extract_text() or "" for page in pdf.pages)

    # Whitespace-only output means there is no real text layer, so use OCR.
    if not text.strip():
        text = "".join(
            pytesseract.image_to_string(image)
            for image in _render_pages(pdf_file)
        )

    return text


def main():
    """Render the uploader UI, show the extracted text and save it to disk."""
    st.title("PDF Text Extractor")
    st.write("Upload a PDF file to extract the text")

    pdf_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if pdf_file is None:
        # Nothing uploaded yet on this script run.
        return

    text = pdf_to_text(pdf_file)
    st.write("Extracted Text:")
    st.code(text)

    # Save the text next to the app, named after the uploaded file.
    txt_file = _txt_name_for(pdf_file.name)
    # Explicit UTF-8 so non-ASCII text does not depend on the locale default.
    with open(txt_file, "w", encoding="utf-8") as f:
        f.write(text)
    st.write(f"Text saved to {txt_file}")


if __name__ == "__main__":
    main()