Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Community

CR7CAD committed on Mar 16

Commit

9753cc9

verified ·

1 Parent(s): 294af95

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -18

app.py CHANGED Viewed

@@ -1,13 +1,11 @@
 import os
 import re
-import torch  # Explicitly imported if you want to use torch directly
-from io import BytesIO
 import streamlit as st
 from PIL import Image
 from transformers import pipeline
-# Use st.cache_resource (Streamlit 1.18+) to load and cache the OCR pipeline once
 @st.cache_resource(show_spinner=False)
 def load_ocr_pipeline():
     try:
@@ -18,22 +16,30 @@ def load_ocr_pipeline():
         st.error(f"Error loading model: {e}")
         st.stop()
-# Load the model at startup
 ocr_pipeline = load_ocr_pipeline()
 st.write("Model loaded successfully!")
 #####################################
-# Pipeline: Extract Text with OCR Pipeline
 #####################################
 def extract_text_from_file(file_obj):
     full_text = ""
-    try:
-        img = Image.open(file_obj)
-        result = ocr_pipeline(img)
-        if isinstance(result, list) and "text" in result[0]:
-            full_text = result[0]["text"]
-    except Exception as e:
-        full_text = f"Error processing image: {e}"
     return full_text
 #####################################
@@ -94,9 +100,9 @@ def process_resume(file_obj):
     if file_obj is None:
         return None, None
-    # Extract text using only the image-based OCR pipeline
     resume_text = extract_text_from_file(file_obj)
-    # Parse basic resume info
     resume_info = extract_basic_resume_info(resume_text)
     return resume_text, resume_info
@@ -105,14 +111,14 @@ def process_resume(file_obj):
 #####################################
 st.title("Resume Extraction and Basic Info Parsing")
 st.markdown("""
-Upload an image file (PNG, JPG, or JPEG) to extract basic text and candidate information.
 """)
-uploaded_file = st.file_uploader("Upload Resume (Image Only)", type=["png", "jpg", "jpeg"])
 if st.button("Extract Info"):
     if uploaded_file is None:
-        st.error("Please upload an image file first.")
     else:
         with st.spinner("Processing..."):
             resume_text, resume_info = process_resume(uploaded_file)

 import os
 import re
 import streamlit as st
 from PIL import Image
 from transformers import pipeline
+from pdfminer.high_level import extract_text
+# Load and cache the OCR model once at startup
 @st.cache_resource(show_spinner=False)
 def load_ocr_pipeline():
     try:
         st.error(f"Error loading model: {e}")
         st.stop()
 ocr_pipeline = load_ocr_pipeline()
 st.write("Model loaded successfully!")
 #####################################
+# Extract Text from File Function
 #####################################
 def extract_text_from_file(file_obj):
     full_text = ""
+    file_extension = os.path.splitext(file_obj.name)[1].lower()
+    if file_extension == ".pdf":
+        try:
+            # Use pdfminer.six to extract text from PDF files.
+            full_text = extract_text(file_obj)
+        except Exception as e:
+            full_text = f"Error processing PDF: {e}"
+    else:
+        try:
+            img = Image.open(file_obj)
+            result = ocr_pipeline(img)
+            if isinstance(result, list) and "text" in result[0]:
+                full_text = result[0]["text"]
+        except Exception as e:
+            full_text = f"Error processing image: {e}"
     return full_text
 #####################################
     if file_obj is None:
         return None, None
+    # Extract text based on file type (PDF or image)
     resume_text = extract_text_from_file(file_obj)
+    # Parse basic resume info using heuristics
     resume_info = extract_basic_resume_info(resume_text)
     return resume_text, resume_info
 #####################################
 st.title("Resume Extraction and Basic Info Parsing")
 st.markdown("""
+Upload a resume file (PDF, PNG, JPG, or JPEG) to extract basic text and candidate information.
 """)
+uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])
 if st.button("Extract Info"):
     if uploaded_file is None:
+        st.error("Please upload a file first.")
     else:
         with st.spinner("Processing..."):
             resume_text, resume_info = process_resume(uploaded_file)