Soumen committed on
Commit
fb120e2
·
1 Parent(s): 9971dd3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -21
app.py CHANGED
@@ -49,28 +49,29 @@ import pytesseract
49
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
50
  from PIL import Image
51
  @st.experimental_singleton
52
- # def read_pdf(file):
53
- # images=pdf2image.convert_from_bytes(file.read(),"rb")
54
- # #pdfReader = PdfFileReader(file)
55
- # #count = pdfReader.numPages
56
- # all_page_text = ""
57
- # for im in images:
58
- # #page = pdfReader.getPage(i)
59
- # img = Image.open(im)
60
- # img = img.save("img.png")
61
- # image_name = cv2.imread("img.png")
62
- # # get co-ordinates to cr
63
- # text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
64
- # all_page_text += text + " " #page.extractText()
65
- # return all_page_text
66
  def read_pdf_with_pdfplumber(file):
67
- all_page_text=" "
68
- with pdfplumber.open(file) as pdf:
69
- page = pdf.pages[0]
70
- ge=page.to_image()
71
- img = Image.open(ge)
72
- img = img.save("img.png")
73
- image_name = cv2.imread("img.png")
 
74
  # get co-ordinates to c
75
  #return page.extract_text()
76
  # get co-ordinates to cr
 
49
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
50
  from PIL import Image
51
  @st.experimental_singleton
52
+ def read_pdf(file):
53
+ #images=pdf2image.convert_from_bytes(file.read(),"rb")
54
+ pdfReader = PdfFileReader(file)
55
+ count = pdfReader.numPages
56
+ all_page_text = ""
57
+ for i in range(count):
58
+ page = pdfReader.getPage(i)
59
+ img = Image.open(page.to_image())
60
+ img = img.save("img.png")
61
+ image_name = cv2.imread("img.png")
62
+ # get co-ordinates to cr
63
+ text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
64
+ all_page_text += text + " " #page.extractText()
65
+ return all_page_text
66
  def read_pdf_with_pdfplumber(file):
67
+ # all_page_text=" "
68
+ # # all_page_text = ""
69
+ # #with pdfplumber.open(file) as pdf:
70
+ # # page = pdf.pages[0]
71
+ # ge=page.to_image()
72
+ # img = Image.open(ge)
73
+ # img = img.save("img.png")
74
+ # image_name = cv2.imread("img.png")
75
  # get co-ordinates to c
76
  #return page.extract_text()
77
  # get co-ordinates to cr