Spaces:

CosmickVisions
/

Data-Vision

Running

CosmickVisions committed on Mar 3

Commit

ab9a20c

verified ·

1 Parent(s): af7ec3d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -16,10 +16,11 @@ import shap
 # PDF and OCR Processing
 import pdfplumber
 import pytesseract
-from pdf2image import convert_from_path
 import spacy
 from spacy import displacy
 from sentence_transformers import SentenceTransformer
 # Machine Learning
 from sklearn.impute import SimpleImputer
@@ -58,17 +59,25 @@ from transformers import TFBertForSequenceClassification
 # --------------------------
 pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update this path if Tesseract is installed elsewhere
def extract_text_from_pdf(pdf, ocr_enabled):
    """Extract the text of a PDF stored on disk.

    Args:
        pdf: Path of the PDF file to read.
        ocr_enabled: When truthy, every page is rasterized at 500 DPI and
            passed through Tesseract. When falsy, the PDF's embedded text
            layer is read with pdfplumber and no rasterization happens.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    if ocr_enabled:
        page_images = convert_from_path(pdf, 500)
        return "".join(pytesseract.image_to_string(img) for img in page_images)

    # The rasterized pages are images and carry no text layer, so the
    # non-OCR path reads the text straight from the PDF instead.
    with pdfplumber.open(pdf) as document:
        # extract_text() can yield None for a page without text.
        return "".join(page.extract_text() or "" for page in document.pages)
 def enhance_section_title(title, icon="✨"):
     """Render ``title`` as an ``<h2>`` heading with a bottom border, prefixed by ``icon``."""
     heading_html = f"<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{icon} {title}</h2>"
     # Raw HTML is required for the inline style, hence unsafe_allow_html.
     st.markdown(heading_html, unsafe_allow_html=True)

 # PDF and OCR Processing
 import pdfplumber
 import pytesseract
+from pdf2image import convert_from_bytes
 import spacy
 from spacy import displacy
 from sentence_transformers import SentenceTransformer
+import io
 # Machine Learning
 from sklearn.impute import SimpleImputer
 # --------------------------
 pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update this path if Tesseract is installed elsewhere
def extract_text_from_pdf(pdf_file, ocr_enabled):
    """Extract the text of an uploaded PDF.

    Args:
        pdf_file: Binary file-like object holding the PDF. It is consumed
            with a single ``read()`` call.
        ocr_enabled: When truthy, every page is rasterized at 500 DPI and
            passed through Tesseract. When falsy, the PDF's embedded text
            layer is read with pdfplumber and no rasterization happens.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    pdf_bytes = pdf_file.read()

    if ocr_enabled:
        page_images = convert_from_bytes(pdf_bytes, dpi=500)
        return "".join(pytesseract.image_to_string(img) for img in page_images)

    # Without OCR there is no reason to rasterize: read the text layer
    # directly from the in-memory PDF so the flag actually selects a
    # different (and much cheaper) extraction path.
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        # extract_text() can yield None for a page without text.
        return "".join(page.extract_text() or "" for page in document.pages)
 def enhance_section_title(title, icon="✨"):
     """Emit a styled section heading: ``icon`` followed by ``title`` inside a bordered ``<h2>``."""
     markup = f"<h2 style='border-bottom: 2px solid #ccc; padding-bottom: 5px;'>{icon} {title}</h2>"
     # The inline style only takes effect when Streamlit is told to allow raw HTML.
     st.markdown(markup, unsafe_allow_html=True)