Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

CR7CAD committed on Mar 16

Commit

08361f0

verified ·

1 Parent(s): 9753cc9

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -15

app.py CHANGED Viewed

@@ -5,11 +5,11 @@ from PIL import Image
 from transformers import pipeline
 from pdfminer.high_level import extract_text
-# Load and cache the OCR model once at startup
 @st.cache_resource(show_spinner=False)
 def load_ocr_pipeline():
     try:
-        # Ensure your transformers library is up-to-date (>=4.x)
         ocr_pipe = pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
         return ocr_pipe
     except Exception as e:
@@ -20,7 +20,7 @@ ocr_pipeline = load_ocr_pipeline()
 st.write("Model loaded successfully!")
 #####################################
-# Extract Text from File Function
 #####################################
 def extract_text_from_file(file_obj):
     full_text = ""
@@ -43,10 +43,17 @@ def extract_text_from_file(file_obj):
     return full_text
 #####################################
-# Information Extraction Functions
 #####################################
 def extract_basic_resume_info(text):
-    """Extract basic resume info: Name, Age, Job Experience, Skills, Expected Industry/Direction."""
     info = {
         "Name": None,
         "Age": None,
@@ -55,22 +62,22 @@ def extract_basic_resume_info(text):
         "Expected Industry/Direction": None,
     }
-    # Extract name (e.g., "Name: John Doe")
     name_match = re.search(r"[Nn]ame[:\-]\s*([A-Za-z\s]+)", text)
     if name_match:
         info["Name"] = name_match.group(1).strip()
     else:
-        # Heuristic: pick the first sequence of capitalized words
         potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', text)
         if potential_names:
             info["Name"] = potential_names[0]
-    # Extract age
     age_match = re.search(r"[Aa]ge[:\-]\s*(\d{1,2})", text)
     if age_match:
         info["Age"] = age_match.group(1)
-    # Extract job experience (years)
     exp_match = re.search(r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE)
     if exp_match:
         info["Job Experience"] = exp_match.group(1) + " years"
@@ -79,14 +86,14 @@ def extract_basic_resume_info(text):
         if exp_line:
             info["Job Experience"] = exp_line.group(2).strip()
-    # Extract skills (e.g., "Skills: Python, Java, SQL")
     skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
     if skills_match:
         skills_text = skills_match.group(1)
         skills = [s.strip() for s in re.split(r",|\n", skills_text) if s.strip()]
         info["Skills"] = skills
-    # Extract expected industry/direction
     industry_match = re.search(r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE)
     if industry_match:
         info["Expected Industry/Direction"] = industry_match.group(2).strip()
@@ -94,15 +101,15 @@ def extract_basic_resume_info(text):
     return info
 #####################################
-# Main Processing Logic
 #####################################
 def process_resume(file_obj):
     if file_obj is None:
         return None, None
-    # Extract text based on file type (PDF or image)
     resume_text = extract_text_from_file(file_obj)
-    # Parse basic resume info using heuristics
     resume_info = extract_basic_resume_info(resume_text)
     return resume_text, resume_info
@@ -111,7 +118,7 @@ def process_resume(file_obj):
 #####################################
 st.title("Resume Extraction and Basic Info Parsing")
 st.markdown("""
-Upload a resume file (PDF, PNG, JPG, or JPEG) to extract basic text and candidate information.
 """)
 uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

 from transformers import pipeline
 from pdfminer.high_level import extract_text
+# Load and cache the OCR model once at startup.
 @st.cache_resource(show_spinner=False)
 def load_ocr_pipeline():
     try:
+        # Initialize the OCR pipeline from transformers. Change the model as needed.
         ocr_pipe = pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
         return ocr_pipe
     except Exception as e:
 st.write("Model loaded successfully!")
 #####################################
+# Text Extraction Function
 #####################################
 def extract_text_from_file(file_obj):
     full_text = ""
     return full_text
 #####################################
+# Resume Information Extraction Functions
 #####################################
 def extract_basic_resume_info(text):
+    """
+    Extract basic resume information, such as:
+    - Name
+    - Age
+    - Job Experience
+    - Skills
+    - Expected Industry/Direction
+    """
     info = {
         "Name": None,
         "Age": None,
         "Expected Industry/Direction": None,
     }
+    # Extract Name: trigger words like 'Name:'
     name_match = re.search(r"[Nn]ame[:\-]\s*([A-Za-z\s]+)", text)
     if name_match:
         info["Name"] = name_match.group(1).strip()
     else:
+        # Fallback: heuristic for sequences of capitalized words.
         potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', text)
         if potential_names:
             info["Name"] = potential_names[0]
+    # Extract Age:
     age_match = re.search(r"[Aa]ge[:\-]\s*(\d{1,2})", text)
     if age_match:
         info["Age"] = age_match.group(1)
+    # Extract Job Experience (years)
     exp_match = re.search(r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE)
     if exp_match:
         info["Job Experience"] = exp_match.group(1) + " years"
         if exp_line:
             info["Job Experience"] = exp_line.group(2).strip()
+    # Extract Skills (e.g., "Skills: Python, Java, SQL")
     skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
     if skills_match:
         skills_text = skills_match.group(1)
         skills = [s.strip() for s in re.split(r",|\n", skills_text) if s.strip()]
         info["Skills"] = skills
+    # Extract Expected Industry/Direction (e.g., "Interest: Software Development")
     industry_match = re.search(r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE)
     if industry_match:
         info["Expected Industry/Direction"] = industry_match.group(2).strip()
     return info
 #####################################
+# Main Resume Processing Logic
 #####################################
 def process_resume(file_obj):
     if file_obj is None:
         return None, None
+    # Extract text based on file type (PDF or image).
     resume_text = extract_text_from_file(file_obj)
+    # Parse basic resume details from the extracted text.
     resume_info = extract_basic_resume_info(resume_text)
     return resume_text, resume_info
 #####################################
 st.title("Resume Extraction and Basic Info Parsing")
 st.markdown("""
+Upload a resume file in PDF or image format (PNG, JPG, JPEG) to extract text and candidate information.
 """)
 uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])