Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -16,10 +16,11 @@ import shap
|
|
16 |
# PDF and OCR Processing
|
17 |
import pdfplumber
|
18 |
import pytesseract
|
19 |
-
from pdf2image import
|
20 |
import spacy
|
21 |
from spacy import displacy
|
22 |
from sentence_transformers import SentenceTransformer
|
|
|
23 |
|
24 |
# Machine Learning
|
25 |
from sklearn.impute import SimpleImputer
|
@@ -58,17 +59,25 @@ from transformers import TFBertForSequenceClassification
|
|
58 |
# --------------------------
|
59 |
# Tell pytesseract which Tesseract executable to invoke for OCR calls.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Change this path if Tesseract is installed elsewhere
|
60 |
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
text = ""
|
64 |
for page in pages:
|
65 |
if ocr_enabled:
|
66 |
text += pytesseract.image_to_string(page)
|
67 |
else:
|
68 |
-
|
|
|
69 |
return text
|
70 |
|
71 |
|
|
|
72 |
def enhance_section_title(title, icon="✨"):
    """Render a section heading: the icon, a space, then the title, inside a styled h2.

    Args:
        title: Heading text; interpolated into the HTML as-is (not escaped).
        icon: Text or emoji placed before the title. Defaults to "✨".

    The markup is passed to ``st.markdown`` with ``unsafe_allow_html=True`` so
    the inline style (bottom border and padding) is honoured.
    NOTE(review): ``st`` is presumably the Streamlit module imported elsewhere
    in this file -- confirm.
    """
    heading_html = (
        "<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>"
        f"{icon} {title}</h2>"
    )
    st.markdown(heading_html, unsafe_allow_html=True)
|
|
|
16 |
# PDF and OCR Processing
|
17 |
import pdfplumber
|
18 |
import pytesseract
|
19 |
+
from pdf2image import convert_from_bytes
|
20 |
import spacy
|
21 |
from spacy import displacy
|
22 |
from sentence_transformers import SentenceTransformer
|
23 |
+
import io
|
24 |
|
25 |
# Machine Learning
|
26 |
from sklearn.impute import SimpleImputer
|
|
|
59 |
# --------------------------
|
60 |
# Tell pytesseract which Tesseract executable to invoke for OCR calls.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Change this path if Tesseract is installed elsewhere
|
61 |
|
62 |
+
|
63 |
+
def extract_text_from_pdf(pdf_file, ocr_enabled, dpi=500):
    """Return the text of a PDF as one string, with or without OCR.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` that yields the
            PDF bytes (presumably a Streamlit upload -- confirm with caller).
        ocr_enabled: When true, every page is rasterised and passed through
            Tesseract. When false, the PDF's embedded text layer is read
            with pdfplumber and no OCR is run.
        dpi: Rasterisation resolution used for the OCR path only. The default
            of 500 matches the value that was previously hard-coded.

    Returns:
        The text of all pages concatenated in page order. For the non-OCR
        path, pages without a text layer contribute an empty string.
    """
    pdf_bytes = pdf_file.read()

    if ocr_enabled:
        # Rasterise only when OCR is actually requested: rendering every page
        # at high DPI is by far the most expensive step of this function.
        page_images = convert_from_bytes(pdf_bytes, dpi=dpi)
        return "".join(
            pytesseract.image_to_string(image) for image in page_images
        )

    # Previously this branch also ran OCR on rendered images, so the flag had
    # no effect. Read the embedded text layer directly instead.
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        # extract_text() may yield None for a page with no text layer.
        return "".join(page.extract_text() or "" for page in pdf.pages)
|
78 |
|
79 |
|
80 |
+
|
81 |
def enhance_section_title(title, icon="✨"):
    """Render a section heading: the icon, a space, then the title, inside a styled h2.

    Args:
        title: Heading text; interpolated into the HTML as-is (not escaped).
        icon: Text or emoji placed before the title. Defaults to "✨".

    The markup is passed to ``st.markdown`` with ``unsafe_allow_html=True`` so
    the inline style (bottom border and padding) is honoured.
    NOTE(review): ``st`` is presumably the Streamlit module imported elsewhere
    in this file -- confirm.
    """
    heading_html = (
        "<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>"
        f"{icon} {title}</h2>"
    )
    st.markdown(heading_html, unsafe_allow_html=True)
|