ValakiJay1706 committed on
Commit
c112dcd
·
verified ·
1 Parent(s): c0f1b96

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -12
app.py CHANGED
@@ -34,6 +34,7 @@ import uuid
34
  import time
35
  import asyncio
36
  import aiohttp
 
37
  # '-----------------'
38
  import smtplib
39
  from email.mime.multipart import MIMEMultipart
@@ -157,25 +158,70 @@ def display_info():
157
  # page = doc.load_page(page_num)
158
  # text += page.get_text()
159
  # return text
160
- # subprocess.run(["git", "clone", "https://github.com/tesseract-ocr/tesseract.git"])
161
- def get_pdf_text(pdf_file):
162
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
163
- text = ""
164
 
165
- for page_num in range(doc.page_count):
166
- page = doc.load_page(page_num)
167
- text += page.get_text()
 
 
 
 
168
 
169
- # Extract images from the page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  image_list = page.get_images(full=True)
 
171
  for img_index, img in enumerate(image_list):
172
  xref = img[0]
173
- base_image = doc.extract_image(xref)
174
  image_bytes = base_image["image"]
 
175
  image = Image.open(io.BytesIO(image_bytes))
176
- text += pytesseract.image_to_string(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- return text
179
 
180
 
181
  def save_feedback(question, answer, rating, options, context):
@@ -571,7 +617,13 @@ def main():
571
  file = st.file_uploader("Upload PDF Files")
572
  if file is not None:
573
  try:
574
- text = get_pdf_text(file)
 
 
 
 
 
 
575
  except Exception as e:
576
  st.error(f"Error reading PDF file: {str(e)}")
577
  text = None
 
34
  import time
35
  import asyncio
36
  import aiohttp
37
+ import easyocr
38
  # '-----------------'
39
  import smtplib
40
  from email.mime.multipart import MIMEMultipart
 
158
  # page = doc.load_page(page_num)
159
  # text += page.get_text()
160
  # return text
 
 
 
 
161
 
162
+ # def get_pdf_text(pdf_file):
163
+ # doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
164
+ # text = ""
165
+
166
+ # for page_num in range(doc.page_count):
167
+ # page = doc.load_page(page_num)
168
+ # text += page.get_text()
169
 
170
+ # # Extract images from the page
171
+ # image_list = page.get_images(full=True)
172
+ # for img_index, img in enumerate(image_list):
173
+ # xref = img[0]
174
+ # base_image = doc.extract_image(xref)
175
+ # image_bytes = base_image["image"]
176
+ # image = Image.open(io.BytesIO(image_bytes))
177
+ # text += pytesseract.image_to_string(image)
178
+
179
+ # return text
180
+
181
+
182
def extract_images_from_pdf(pdf_path):
    """Extract every embedded image from a PDF as PIL images.

    Args:
        pdf_path: Either a filesystem path to the PDF, or a binary file-like
            object exposing ``read()``. The caller in ``main`` passes the
            uploaded file object rather than a path, so both are accepted.

    Returns:
        list: One PIL image per image reference found, in page order.
    """
    if hasattr(pdf_path, "read"):
        # In-memory upload: PyMuPDF needs the raw bytes plus an explicit type.
        pdf_file = fitz.open(stream=pdf_path.read(), filetype="pdf")
    else:
        pdf_file = fitz.open(pdf_path)

    images = []
    try:
        for page_index in range(len(pdf_file)):
            page = pdf_file.load_page(page_index)
            for img in page.get_images(full=True):
                # The first entry of each image record is its xref number.
                xref = img[0]
                base_image = pdf_file.extract_image(xref)
                # The extracted bytes are independent of the document, so the
                # resulting image stays usable after the document is closed.
                images.append(Image.open(io.BytesIO(base_image["image"])))
    finally:
        # Always release the document, even if an image fails to decode.
        pdf_file.close()

    return images
200
+
201
# Shared EasyOCR reader; building one loads the OCR models, so it is created
# lazily and reused for every image instead of once per call.
_OCR_READER = None


def _get_ocr_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _OCR_READER
    if _OCR_READER is None:
        _OCR_READER = easyocr.Reader(['en'])
    return _OCR_READER


def recognize_text(image):
    """Recognize text from a single image.

    Args:
        image: A PIL image, or any input EasyOCR's ``readtext`` accepts
            directly (file path/URL string, raw bytes, or numpy array).

    Returns:
        str: Each recognized fragment with confidence above 0.2, one per
        line (every fragment is followed by a newline). Empty string when
        nothing passes the threshold.
    """
    if hasattr(image, "convert"):
        # PIL image: EasyOCR documents path/bytes/ndarray inputs, so hand it
        # a plain 3-channel array regardless of the image's original mode.
        import numpy as np

        image = np.array(image.convert("RGB"))

    result = _get_ocr_reader().readtext(image)

    # Each result is (bounding box, text, confidence); drop low-confidence hits.
    return "".join(f'{text}\n' for _bbox, text, prob in result if prob > 0.2)
212
+
213
def ocr_text_from_pdf(pdf_path):
    """Run OCR over every image embedded in the PDF and return the combined text.

    Images come from ``extract_images_from_pdf(pdf_path)``; each one is passed
    to ``recognize_text`` and the results are concatenated in order.
    """
    return "".join(
        recognize_text(picture)
        for picture in extract_images_from_pdf(pdf_path)
    )
223
+
224
 
 
225
 
226
 
227
  def save_feedback(question, answer, rating, options, context):
 
617
  file = st.file_uploader("Upload PDF Files")
618
  if file is not None:
619
  try:
620
+ # pdf_path = "path/to/your/pdf_file.pdf"
621
+
622
+ # Extract text from the PDF
623
+ text = ocr_text_from_pdf(file)
624
+ # print(extracted_text)
625
+
626
+ # text = get_pdf_text(file)
627
  except Exception as e:
628
  st.error(f"Error reading PDF file: {str(e)}")
629
  text = None