Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Sleeping

Soumen committed on Nov 25, 2022

Commit

fb120e2

1 Parent(s): 9971dd3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -49,28 +49,29 @@ import pytesseract
 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
-# def read_pdf(file):
-#     images=pdf2image.convert_from_bytes(file.read(),"rb")
-#     #pdfReader = PdfFileReader(file)
-#     #count = pdfReader.numPages
-#     all_page_text = ""
-#     for im in images:
-#         #page = pdfReader.getPage(i)
-#         img = Image.open(im)
-#         img = img.save("img.png")
-#         image_name = cv2.imread("img.png")
-#         # get co-ordinates to cr
-#         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-#         all_page_text += text + " " #page.extractText()
-#     return all_page_text
 def read_pdf_with_pdfplumber(file):
-    all_page_text=" "
-    with pdfplumber.open(file) as pdf:
-	    page = pdf.pages[0]
-    ge=page.to_image()
-    img = Image.open(ge)
-    img = img.save("img.png")
-    image_name = cv2.imread("img.png")
 # get co-ordinates to c
         #return page.extract_text()
         # get co-ordinates to cr

 #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
 from PIL import Image
 @st.experimental_singleton
+def read_pdf(file):  # Return the OCR text of every page in `file`, each page's text followed by a space.
+    #images=pdf2image.convert_from_bytes(file.read(),"rb")
+    pdfReader = PdfFileReader(file)
+    count = pdfReader.numPages
+    all_page_text = ""
+    for i in range(count):
+        page = pdfReader.getPage(i)
+        img = Image.open(page.to_image())  # NOTE(review): to_image() was called on a pdfplumber page in the removed code; confirm PdfFileReader pages provide it
+        img = img.save("img.png")  # img is rebound to save()'s return value and is not used again
+        image_name = cv2.imread("img.png")  # re-read the file just written so OCR receives the cv2-loaded image
+        # OCR with lang="ben" when the checkbox is ticked, otherwise without a lang argument. NOTE(review): the checkbox is created once per page inside the loop -- confirm this is intended.
+        text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
+        all_page_text += text + " " #page.extractText()
+    return all_page_text
 def read_pdf_with_pdfplumber(file):
+#     all_page_text=" "
+# #     all_page_text = ""
+#     #with pdfplumber.open(file) as pdf:
+# 	   # page = pdf.pages[0]
+#     ge=page.to_image()
+#     img = Image.open(ge)
+#     img = img.save("img.png")
+#     image_name = cv2.imread("img.png")
 # get co-ordinates to c
         #return page.extract_text()
         # get co-ordinates to cr