Jay Valaki committed on
Commit
a14c6cb
·
verified ·
1 Parent(s): 0003def

The implementation of Optical Character Recognition (OCR).

Files changed (1) hide show
  1. app.py +25 -1
app.py CHANGED
@@ -26,6 +26,10 @@ from spellchecker import SpellChecker
26
  from transformers import pipeline
27
  import re
28
  import pymupdf
 
 
 
 
29
  import uuid
30
  import time
31
  import asyncio
@@ -146,14 +150,34 @@ def display_info():
146
 
147
  """)
148
 
 
 
 
 
 
 
 
 
149
  def get_pdf_text(pdf_file):
150
- doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
151
  text = ""
 
152
  for page_num in range(doc.page_count):
153
  page = doc.load_page(page_num)
154
  text += page.get_text()
 
 
 
 
 
 
 
 
 
 
155
  return text
156
 
 
157
  def save_feedback(question, answer, rating, options, context):
158
  feedback_file = 'question_feedback.json'
159
  if os.path.exists(feedback_file):
 
26
  from transformers import pipeline
27
  import re
28
  import pymupdf
29
+ import fitz # PyMuPDF
30
+ import pytesseract
31
+ from PIL import Image
32
+ import io
33
  import uuid
34
  import time
35
  import asyncio
 
150
 
151
  """)
152
 
153
+ # def get_pdf_text(pdf_file):
154
+ # doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
155
+ # text = ""
156
+ # for page_num in range(doc.page_count):
157
+ # page = doc.load_page(page_num)
158
+ # text += page.get_text()
159
+ # return text
160
+
161
  def get_pdf_text(pdf_file):
162
+ doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
163
  text = ""
164
+
165
  for page_num in range(doc.page_count):
166
  page = doc.load_page(page_num)
167
  text += page.get_text()
168
+
169
+ # Extract images from the page
170
+ image_list = page.get_images(full=True)
171
+ for img_index, img in enumerate(image_list):
172
+ xref = img[0]
173
+ base_image = doc.extract_image(xref)
174
+ image_bytes = base_image["image"]
175
+ image = Image.open(io.BytesIO(image_bytes))
176
+ text += pytesseract.image_to_string(image)
177
+
178
  return text
179
 
180
+
181
  def save_feedback(question, answer, rating, options, context):
182
  feedback_file = 'question_feedback.json'
183
  if os.path.exists(feedback_file):