pvyas96 committed on
Commit
f8883b1
·
verified ·
1 Parent(s): 97eaba0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import PyPDF2
3
+ import pytesseract
4
+ from PIL import Image
5
+ from pdf2image import convert_from_path
6
+
7
def pdf_to_text(pdf_file):
    """Extract the text of a PDF, falling back to OCR when it has no text.

    Args:
        pdf_file: Either a filesystem path or a binary file-like object
            (anything exposing ``read`` and ``seek``, such as an uploaded
            file).

    Returns:
        The concatenated text of every page. When the embedded text is
        empty or whitespace only, the pages are rendered to images and the
        OCR output of those images is returned instead.
    """
    # PyPDF2 3.x removed PdfFileReader/getPage/extractText; prefer the
    # current names and only fall back to the legacy ones on old installs.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    reader = reader_cls(pdf_file)

    page_texts = []
    for page in reader.pages:
        extract = getattr(page, "extract_text", None) or page.extractText
        # Guard against a page yielding None so the join below cannot fail.
        page_texts.append(extract() or "")
    text = "".join(page_texts)

    # A text layer made only of whitespace is as useless as none at all.
    if text.strip():
        return text

    # No usable text layer (e.g. a scanned document): render pages and OCR.
    if hasattr(pdf_file, "read"):
        # convert_from_path needs a real path; file-like objects must be
        # handed over as raw bytes instead.
        from pdf2image import convert_from_bytes

        pdf_file.seek(0)  # the PDF reader has already consumed the stream
        images = convert_from_bytes(pdf_file.read())
    else:
        images = convert_from_path(pdf_file)

    return "".join(pytesseract.image_to_string(image) for image in images)
27
+
28
def main():
    """Render the page: upload a PDF, display its text, save it as .txt.

    The extracted text is shown in the page and also written to a ``.txt``
    file in the current working directory, named after the uploaded file.
    """
    # Local import: only needed here, to sanitise the uploaded file's name.
    from pathlib import Path

    st.title("PDF Text Extractor")
    st.write("Upload a PDF file to extract the text")

    pdf_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if pdf_file is None:
        return

    text = pdf_to_text(pdf_file)
    st.write("Extracted Text:")
    st.code(text)

    # The upload name is client-controlled: keep only its final component so
    # the output cannot escape the working directory, and swap the extension
    # via the stem so ".PDF" and names containing ".pdf" twice work too.
    stem = Path(pdf_file.name).stem or "extracted"
    txt_file = f"{stem}.txt"

    # Explicit UTF-8 so OCR/Unicode text does not depend on the locale.
    with open(txt_file, "w", encoding="utf-8") as f:
        f.write(text)
    st.write(f"Text saved to {txt_file}")
44
+
45
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()