Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -34,6 +34,7 @@ import uuid
|
|
34 |
import time
|
35 |
import asyncio
|
36 |
import aiohttp
|
|
|
37 |
# '-----------------'
|
38 |
import smtplib
|
39 |
from email.mime.multipart import MIMEMultipart
|
@@ -157,25 +158,70 @@ def display_info():
|
|
157 |
# page = doc.load_page(page_num)
|
158 |
# text += page.get_text()
|
159 |
# return text
|
160 |
-
# subprocess.run(["git", "clone", "https://github.com/tesseract-ocr/tesseract.git"])
|
161 |
-
def get_pdf_text(pdf_file):
|
162 |
-
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
163 |
-
text = ""
|
164 |
|
165 |
-
|
166 |
-
|
167 |
-
|
|
|
|
|
|
|
|
|
168 |
|
169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
image_list = page.get_images(full=True)
|
|
|
171 |
for img_index, img in enumerate(image_list):
|
172 |
xref = img[0]
|
173 |
-
base_image =
|
174 |
image_bytes = base_image["image"]
|
|
|
175 |
image = Image.open(io.BytesIO(image_bytes))
|
176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
-
return text
|
179 |
|
180 |
|
181 |
def save_feedback(question, answer, rating, options, context):
|
@@ -571,7 +617,13 @@ def main():
|
|
571 |
file = st.file_uploader("Upload PDF Files")
|
572 |
if file is not None:
|
573 |
try:
|
574 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
575 |
except Exception as e:
|
576 |
st.error(f"Error reading PDF file: {str(e)}")
|
577 |
text = None
|
|
|
34 |
import time
|
35 |
import asyncio
|
36 |
import aiohttp
|
37 |
+
import easyocr
|
38 |
# '-----------------'
|
39 |
import smtplib
|
40 |
from email.mime.multipart import MIMEMultipart
|
|
|
158 |
# page = doc.load_page(page_num)
|
159 |
# text += page.get_text()
|
160 |
# return text
|
|
|
|
|
|
|
|
|
161 |
|
162 |
+
# def get_pdf_text(pdf_file):
|
163 |
+
# doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
164 |
+
# text = ""
|
165 |
+
|
166 |
+
# for page_num in range(doc.page_count):
|
167 |
+
# page = doc.load_page(page_num)
|
168 |
+
# text += page.get_text()
|
169 |
|
170 |
+
# # Extract images from the page
|
171 |
+
# image_list = page.get_images(full=True)
|
172 |
+
# for img_index, img in enumerate(image_list):
|
173 |
+
# xref = img[0]
|
174 |
+
# base_image = doc.extract_image(xref)
|
175 |
+
# image_bytes = base_image["image"]
|
176 |
+
# image = Image.open(io.BytesIO(image_bytes))
|
177 |
+
# text += pytesseract.image_to_string(image)
|
178 |
+
|
179 |
+
# return text
|
180 |
+
|
181 |
+
|
182 |
+
def extract_images_from_pdf(pdf_path):
    """Extract every embedded image from a PDF as PIL images.

    Args:
        pdf_path: Either a filesystem path to the PDF, or a binary
            file-like object exposing ``read()`` (such as the object
            returned by ``st.file_uploader``), whose bytes are parsed
            in memory.

    Returns:
        A list of PIL images, ordered by page and then by the order in
        which ``page.get_images(full=True)`` lists them.
    """
    if hasattr(pdf_path, "read"):
        # Uploaded files are streams, not paths: hand PyMuPDF the raw bytes.
        pdf_file = fitz.open(stream=pdf_path.read(), filetype="pdf")
    else:
        pdf_file = fitz.open(pdf_path)

    images = []
    try:
        for page in pdf_file:
            for img in page.get_images(full=True):
                xref = img[0]  # cross-reference number used to look the image up
                base_image = pdf_file.extract_image(xref)
                # The PIL image reads from its own in-memory buffer, so it
                # stays usable after the document is closed below.
                images.append(Image.open(io.BytesIO(base_image["image"])))
    finally:
        # Always release the document, even if an image fails to open.
        pdf_file.close()

    return images
|
200 |
+
|
201 |
+
def recognize_text(image):
    """Recognize text from a single image.

    Args:
        image: A PIL image, or any input that ``easyocr.Reader.readtext``
            takes directly (file path or URL, bytes, numpy array).

    Returns:
        The recognized text fragments whose confidence is above 0.2, each
        followed by a newline; an empty string when nothing qualifies.
    """
    import numpy as np

    # Constructing a Reader loads the OCR models, so build it once and
    # reuse it on later calls instead of reloading it for every image.
    reader = getattr(recognize_text, "_reader", None)
    if reader is None:
        reader = easyocr.Reader(['en'])
        recognize_text._reader = reader

    if hasattr(image, "convert"):
        # PIL image: normalise to a 3-channel array, which is one of
        # readtext's documented input types. Channels are reversed to
        # BGR because easyocr processes arrays with OpenCV conventions.
        rgb_pixels = np.array(image.convert("RGB"))
        image = np.ascontiguousarray(rgb_pixels[:, :, ::-1])

    result = reader.readtext(image)

    # Each result entry is (bounding box, text, confidence).
    return "".join(f'{text}\n' for _bbox, text, prob in result if prob > 0.2)
|
212 |
+
|
213 |
+
def ocr_text_from_pdf(pdf_path):
    """Run OCR over every image embedded in the PDF and concatenate the text.

    Args:
        pdf_path: The PDF to process; forwarded unchanged to
            ``extract_images_from_pdf``.

    Returns:
        The text recognized in each extracted image, joined in extraction
        order with no extra separator; an empty string if the PDF yields
        no images.
    """
    extracted = extract_images_from_pdf(pdf_path)
    return "".join(recognize_text(picture) for picture in extracted)
|
223 |
+
|
224 |
|
|
|
225 |
|
226 |
|
227 |
def save_feedback(question, answer, rating, options, context):
|
|
|
617 |
file = st.file_uploader("Upload PDF Files")
|
618 |
if file is not None:
|
619 |
try:
|
620 |
+
# pdf_path = "path/to/your/pdf_file.pdf"
|
621 |
+
|
622 |
+
# Extract text from the PDF
|
623 |
+
text = ocr_text_from_pdf(file)
|
624 |
+
# print(extracted_text)
|
625 |
+
|
626 |
+
# text = get_pdf_text(file)
|
627 |
except Exception as e:
|
628 |
st.error(f"Error reading PDF file: {str(e)}")
|
629 |
text = None
|