Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -17,6 +17,10 @@ import shap
|
|
17 |
import pdfplumber
|
18 |
import pytesseract
|
19 |
from pdf2image import convert_from_path
|
|
|
|
|
|
|
|
|
20 |
|
21 |
# Machine Learning
|
22 |
from sklearn.impute import SimpleImputer
|
@@ -55,16 +59,14 @@ from transformers import TFBertForSequenceClassification
|
|
55 |
# --------------------------
|
56 |
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path if Tesseract is installed elsewhere
|
57 |
|
58 |
-
def extract_text_from_pdf(
|
|
|
59 |
text = ""
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
with pdfplumber.open(pdf_path) as pdf:
|
66 |
-
for page in pdf.pages:
|
67 |
-
text += page.extract_text()
|
68 |
return text
|
69 |
|
70 |
|
@@ -347,30 +349,9 @@ def prediction_input_form(features, default_values=None):
|
|
347 |
input_data[feature] = st.number_input(f"{feature}:", value=default_value)
|
348 |
return input_data
|
349 |
|
350 |
-
# Enhanced Helper Functions
|
351 |
-
def extract_text_from_pdf(pdf_file, use_ocr=False):
|
352 |
-
"""Extract text with OCR support"""
|
353 |
-
try:
|
354 |
-
import pdfplumber
|
355 |
-
with pdfplumber.open(pdf_file) as pdf:
|
356 |
-
text = "\n".join([page.extract_text() for page in pdf.pages])
|
357 |
-
|
358 |
-
if use_ocr or len(text) < 50: # Fallback to OCR
|
359 |
-
import fitz # PyMuPDF
|
360 |
-
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
361 |
-
text = ""
|
362 |
-
for page in doc:
|
363 |
-
text += page.get_text("text")
|
364 |
-
if len(text) < 50:
|
365 |
-
raise ValueError("Likely scanned document - enable OCR")
|
366 |
-
return text
|
367 |
-
except Exception as e:
|
368 |
-
raise RuntimeError(f"Text extraction failed: {str(e)}")
|
369 |
|
370 |
def visualize_entities(text):
|
371 |
"""Create interactive entity visualization"""
|
372 |
-
import spacy
|
373 |
-
from spacy import displacy
|
374 |
nlp = spacy.load("en_core_web_sm")
|
375 |
doc = nlp(text)
|
376 |
html = displacy.render(doc, style="ent", page=True)
|
@@ -384,7 +365,6 @@ def generate_embeddings(text):
|
|
384 |
|
385 |
def extract_metadata(pdf_file):
|
386 |
"""Extract PDF metadata"""
|
387 |
-
import fitz
|
388 |
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
389 |
return {
|
390 |
"author": doc.metadata.get("author"),
|
|
|
17 |
import pdfplumber
|
18 |
import pytesseract
|
19 |
from pdf2image import convert_from_path
|
20 |
+
import fitz
|
21 |
+
import spacy
|
22 |
+
from spacy import displacy
|
23 |
+
from sentence_transformers import SentenceTransformer
|
24 |
|
25 |
# Machine Learning
|
26 |
from sklearn.impute import SimpleImputer
|
|
|
59 |
# --------------------------
|
60 |
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path if Tesseract is installed elsewhere
|
61 |
|
62 |
+
def extract_text_from_pdf(pdf, ocr_enabled=False):
    """Extract the text of a PDF, either from its text layer or by OCR.

    Args:
        pdf: Path of the PDF file to read.
        ocr_enabled: When true, every page is rasterised at 500 DPI and
            read with Tesseract. When false, the embedded text layer is
            read with pdfplumber. Defaults to False so that one-argument
            calls keep working.

    Returns:
        The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    if ocr_enabled:
        # Rasterise only on the OCR path: rendering at 500 DPI is expensive
        # and the resulting PIL images are useful to Tesseract alone.
        pages = convert_from_path(pdf, 500)
        return "".join(pytesseract.image_to_string(page) for page in pages)

    # The images produced by pdf2image have no get_text() method, so the
    # text layer must be read from the PDF itself.
    with pdfplumber.open(pdf) as document:
        # extract_text() may yield None for a page without a text layer.
        return "".join(page.extract_text() or "" for page in document.pages)
|
71 |
|
72 |
|
|
|
349 |
input_data[feature] = st.number_input(f"{feature}:", value=default_value)
|
350 |
return input_data
|
351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
352 |
|
353 |
def visualize_entities(text):
|
354 |
"""Create interactive entity visualization"""
|
|
|
|
|
355 |
nlp = spacy.load("en_core_web_sm")
|
356 |
doc = nlp(text)
|
357 |
html = displacy.render(doc, style="ent", page=True)
|
|
|
365 |
|
366 |
def extract_metadata(pdf_file):
|
367 |
"""Extract PDF metadata"""
|
|
|
368 |
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
369 |
return {
|
370 |
"author": doc.metadata.get("author"),
|