CosmickVisions committed on
Commit
ab9a20c
·
verified ·
1 Parent(s): af7ec3d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -4
app.py CHANGED
@@ -16,10 +16,11 @@ import shap
16
  # PDF and OCR Processing
17
  import pdfplumber
18
  import pytesseract
19
- from pdf2image import convert_from_path
20
  import spacy
21
  from spacy import displacy
22
  from sentence_transformers import SentenceTransformer
 
23
 
24
  # Machine Learning
25
  from sklearn.impute import SimpleImputer
@@ -58,17 +59,25 @@ from transformers import TFBertForSequenceClassification
58
  # --------------------------
59
  # Tell pytesseract where the Tesseract executable lives on this machine.
  pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path if Tesseract is installed elsewhere
60
 
61
def extract_text_from_pdf(pdf, ocr_enabled):
    """Extract the text of a PDF file, optionally through OCR.

    Args:
        pdf: Path of the PDF file to read.
        ocr_enabled: When true, every page is rasterised at 500 DPI and
            recognised with Tesseract. When false, the text layer embedded
            in the PDF is read with pdfplumber and no OCR is run.

    Returns:
        The text of all pages, concatenated in page order.
    """
    if ocr_enabled:
        # Rasterising is only needed for OCR, so it is skipped otherwise.
        pages = convert_from_path(pdf, 500)
        return "".join(pytesseract.image_to_string(page) for page in pages)

    # convert_from_path() yields PIL images, which have no get_text() method,
    # so the embedded text has to be read from the PDF itself.
    with pdfplumber.open(pdf) as document:
        # Guard against extract_text() returning None for a page without
        # text (seen in some pdfplumber versions).
        return "".join(page.extract_text() or "" for page in document.pages)
70
 
71
 
 
72
  def enhance_section_title(title, icon="✨"):
73
  """Helper function to create a styled section title with an icon."""
74
  st.markdown(f"<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{icon} {title}</h2>", unsafe_allow_html=True)
 
16
  # PDF and OCR Processing
17
  import pdfplumber
18
  import pytesseract
19
+ from pdf2image import convert_from_bytes
20
  import spacy
21
  from spacy import displacy
22
  from sentence_transformers import SentenceTransformer
23
+ import io
24
 
25
  # Machine Learning
26
  from sklearn.impute import SimpleImputer
 
59
  # --------------------------
60
  # Tell pytesseract where the Tesseract executable lives on this machine.
  pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Update this path if Tesseract is installed elsewhere
61
 
62
+
63
def extract_text_from_pdf(pdf_file, ocr_enabled, *, dpi=500):
    """Extract the text of an uploaded PDF, optionally through OCR.

    Args:
        pdf_file: Binary file-like object holding the PDF. It is read in
            full via ``pdf_file.read()``.
        ocr_enabled: When true, every page is rasterised and recognised with
            Tesseract. When false, the text layer embedded in the PDF is
            read with pdfplumber and no OCR is run.
        dpi: Resolution used to rasterise pages for OCR. Defaults to 500.

    Returns:
        The text of all pages, concatenated in page order. An empty string
        is returned when ``pdf_file`` yields no bytes.
    """
    pdf_bytes = pdf_file.read()
    if not pdf_bytes:
        # An empty upload has no pages to parse; skip the PDF libraries
        # instead of letting them fail on zero bytes.
        return ""

    if ocr_enabled:
        # Rasterising is only needed for OCR, so it is skipped otherwise.
        pages = convert_from_bytes(pdf_bytes, dpi)
        return "".join(pytesseract.image_to_string(page) for page in pages)

    # With OCR disabled, read the embedded text layer directly rather than
    # running the same OCR call as the branch above.
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        # Guard against extract_text() returning None for a page without
        # text (seen in some pdfplumber versions).
        return "".join(page.extract_text() or "" for page in document.pages)
78
 
79
 
80
+
81
  def enhance_section_title(title, icon="✨"):
82
  """Helper function to create a styled section title with an icon."""
83
  st.markdown(f"<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{icon} {title}</h2>", unsafe_allow_html=True)