CosmickVisions committed on
Commit
5b5d383
·
verified ·
1 Parent(s): c5f2730

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -31
app.py CHANGED
@@ -17,6 +17,10 @@ import shap
17
  import pdfplumber
18
  import pytesseract
19
  from pdf2image import convert_from_path
 
 
 
 
20
 
21
  # Machine Learning
22
  from sklearn.impute import SimpleImputer
@@ -55,16 +59,14 @@ from transformers import TFBertForSequenceClassification
55
  # --------------------------
56
  pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path if Tesseract is installed elsewhere
57
 
58
- def extract_text_from_pdf(pdf_path, ocr_enabled=False):
 
59
  text = ""
60
- if ocr_enabled:
61
- images = convert_from_path(pdf_path)
62
- for image in images:
63
- text += pytesseract.image_to_string(image)
64
- else:
65
- with pdfplumber.open(pdf_path) as pdf:
66
- for page in pdf.pages:
67
- text += page.extract_text()
68
  return text
69
 
70
 
@@ -347,30 +349,9 @@ def prediction_input_form(features, default_values=None):
347
  input_data[feature] = st.number_input(f"{feature}:", value=default_value)
348
  return input_data
349
 
350
- # Enhanced Helper Functions
351
- def extract_text_from_pdf(pdf_file, use_ocr=False):
352
- """Extract text with OCR support"""
353
- try:
354
- import pdfplumber
355
- with pdfplumber.open(pdf_file) as pdf:
356
- text = "\n".join([page.extract_text() for page in pdf.pages])
357
-
358
- if use_ocr or len(text) < 50: # Fallback to OCR
359
- import fitz # PyMuPDF
360
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
361
- text = ""
362
- for page in doc:
363
- text += page.get_text("text")
364
- if len(text) < 50:
365
- raise ValueError("Likely scanned document - enable OCR")
366
- return text
367
- except Exception as e:
368
- raise RuntimeError(f"Text extraction failed: {str(e)}")
369
 
370
  def visualize_entities(text):
371
  """Create interactive entity visualization"""
372
- import spacy
373
- from spacy import displacy
374
  nlp = spacy.load("en_core_web_sm")
375
  doc = nlp(text)
376
  html = displacy.render(doc, style="ent", page=True)
@@ -384,7 +365,6 @@ def generate_embeddings(text):
384
 
385
  def extract_metadata(pdf_file):
386
  """Extract PDF metadata"""
387
- import fitz
388
  doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
389
  return {
390
  "author": doc.metadata.get("author"),
 
17
  import pdfplumber
18
  import pytesseract
19
  from pdf2image import convert_from_path
20
+ import fitz
21
+ import spacy
22
+ from spacy import displacy
23
+ from sentence_transformers import SentenceTransformer
24
 
25
  # Machine Learning
26
  from sklearn.impute import SimpleImputer
 
59
  # --------------------------
60
  pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path if Tesseract is installed elsewhere
61
 
62
def extract_text_from_pdf(pdf, ocr_enabled=False):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf: Filesystem path of the PDF; it is handed unchanged to
            ``convert_from_path`` (OCR path) or ``fitz.open`` (text path).
        ocr_enabled: When true, rasterise each page and run Tesseract OCR on
            the image; when false, read the embedded text layer with
            PyMuPDF. Defaults to ``False`` so one-argument calls work.

    Returns:
        The page texts concatenated in page order.
    """
    if ocr_enabled:
        # Rasterise only on the OCR path: rendering every page at 500 DPI is
        # expensive, and the PIL images it yields have no ``get_text``
        # method, so they cannot serve the non-OCR branch.
        page_images = convert_from_path(pdf, 500)
        return "".join(
            pytesseract.image_to_string(image) for image in page_images
        )

    # ``get_text`` is a PyMuPDF page method, so iterate a fitz document
    # rather than the rasterised images.
    with fitz.open(pdf) as document:
        return "".join(page.get_text() for page in document)
71
 
72
 
 
349
  input_data[feature] = st.number_input(f"{feature}:", value=default_value)
350
  return input_data
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
  def visualize_entities(text):
354
  """Create interactive entity visualization"""
 
 
355
  nlp = spacy.load("en_core_web_sm")
356
  doc = nlp(text)
357
  html = displacy.render(doc, style="ent", page=True)
 
365
 
366
  def extract_metadata(pdf_file):
367
  """Extract PDF metadata"""
 
368
  doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
369
  return {
370
  "author": doc.metadata.get("author"),