Spaces:
Sleeping
Sleeping
Jay Valaki
committed on
app.py
Browse files: The implementation of Optical Character Recognition (OCR).
app.py
CHANGED
@@ -26,6 +26,10 @@ from spellchecker import SpellChecker
|
|
26 |
from transformers import pipeline
|
27 |
import re
|
28 |
import pymupdf
|
|
|
|
|
|
|
|
|
29 |
import uuid
|
30 |
import time
|
31 |
import asyncio
|
@@ -146,14 +150,34 @@ def display_info():
|
|
146 |
|
147 |
""")
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
def get_pdf_text(pdf_file):
|
150 |
-
doc =
|
151 |
text = ""
|
|
|
152 |
for page_num in range(doc.page_count):
|
153 |
page = doc.load_page(page_num)
|
154 |
text += page.get_text()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
return text
|
156 |
|
|
|
157 |
def save_feedback(question, answer, rating, options, context):
|
158 |
feedback_file = 'question_feedback.json'
|
159 |
if os.path.exists(feedback_file):
|
|
|
26 |
from transformers import pipeline
|
27 |
import re
|
28 |
import pymupdf
|
29 |
+
import fitz # PyMuPDF
|
30 |
+
import pytesseract
|
31 |
+
from PIL import Image
|
32 |
+
import io
|
33 |
import uuid
|
34 |
import time
|
35 |
import asyncio
|
|
|
150 |
|
151 |
""")
|
152 |
|
153 |
+
# def get_pdf_text(pdf_file):
|
154 |
+
# doc = pymupdf.open(stream=pdf_file.read(), filetype="pdf")
|
155 |
+
# text = ""
|
156 |
+
# for page_num in range(doc.page_count):
|
157 |
+
# page = doc.load_page(page_num)
|
158 |
+
# text += page.get_text()
|
159 |
+
# return text
|
160 |
+
|
161 |
def get_pdf_text(pdf_file):
|
162 |
+
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
163 |
text = ""
|
164 |
+
|
165 |
for page_num in range(doc.page_count):
|
166 |
page = doc.load_page(page_num)
|
167 |
text += page.get_text()
|
168 |
+
|
169 |
+
# Extract images from the page
|
170 |
+
image_list = page.get_images(full=True)
|
171 |
+
for img_index, img in enumerate(image_list):
|
172 |
+
xref = img[0]
|
173 |
+
base_image = doc.extract_image(xref)
|
174 |
+
image_bytes = base_image["image"]
|
175 |
+
image = Image.open(io.BytesIO(image_bytes))
|
176 |
+
text += pytesseract.image_to_string(image)
|
177 |
+
|
178 |
return text
|
179 |
|
180 |
+
|
181 |
def save_feedback(question, answer, rating, options, context):
|
182 |
feedback_file = 'question_feedback.json'
|
183 |
if os.path.exists(feedback_file):
|