Spaces:

Manojajj
/

bert-base-cased-resume_parser

Sleeping

App Files Files Community

Manojajj committed on Nov 17, 2024

Commit

18b1dee

verified ·

1 Parent(s): b5578a6

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -35

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import os
 # Function to login using Hugging Face API token
 def login_with_token(hf_token):
-    """Login to Hugging Face using provided token"""
     try:
         login(token=hf_token)
         return "Logged in successfully!"
@@ -19,46 +19,51 @@ def login_with_token(hf_token):
 nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", framework="pt")
 def extract_text_from_pdf(pdf_file):
-    """Extracts text from a PDF file using pdfplumber"""
-    with pdfplumber.open(pdf_file.name) as pdf:
-        text = ''
-        for page in pdf.pages:
-            text += page.extract_text()
-    return text
 def parse_resume(pdf_file):
-    """Parses the resume and extracts relevant information"""
     # Extract text from PDF
     resume_text = extract_text_from_pdf(pdf_file)
-    # Log the extracted text for debugging
-    print("Extracted Text from Resume:")
-    print(resume_text[:500])  # Print the first 500 characters for preview
     # Use the NER model to identify entities in the resume
     entities = nlp(resume_text)
-    # Log the NER output for debugging
-    print("NER Output:")
-    print(entities)
     # Initialize empty fields
     name = email = phone = education = skills = experience = None
     # Example parsing logic based on NER output
     for entity in entities:
-        # Check if 'label' key exists in the entity to avoid KeyError
-        if 'label' in entity:
-            if entity['label'] == 'PER':
-                name = entity['word']  # If detected, use the first person name
-            elif entity['label'] == 'ORG':
-                experience = entity['word']  # Could be an organization name (e.g., employer)
-            elif entity['label'] == 'EMAIL':
-                email = entity['word']
-            elif entity['label'] == 'MISC':
-                skills = entity['word']  # Example for skills or qualifications
-    # Log the final parsed information for debugging
     print(f"Parsed Info: Name={name}, Email={email}, Skills={skills}, Experience={experience}")
     return {
@@ -71,22 +76,23 @@ def parse_resume(pdf_file):
     }
 def batch_process_resumes(pdf_files):
-    """Process a batch of resume PDFs and output in a DataFrame"""
     all_resumes = []
     for pdf_file in pdf_files:
         resume_info = parse_resume(pdf_file)
         # Only add the parsed resume info if there's meaningful data
-        if any(resume_info.values()):  # Skip empty resume entries
             all_resumes.append(resume_info)
     # Convert to DataFrame
     df = pd.DataFrame(all_resumes)
-    # If the DataFrame is empty, return a message indicating no data was found
-    if df.empty:
-        return "No valid resume information was parsed."
     # Define the file path for the Excel file
     output_file = "/tmp/parsed_resumes.xlsx"
@@ -120,10 +126,13 @@ with gr.Blocks() as demo:
         # Attempt to log in with provided token
         login_message = login_with_token(hf_token)
-        # If login is successful, process resumes and generate the download link
         if "Error" not in login_message:
             excel_file_path = batch_process_resumes(pdf_files)
-            return login_message + "\nExcel file with parsed resumes is ready for download.", excel_file_path
         else:
             return login_message, None

 # Function to login using Hugging Face API token
 def login_with_token(hf_token):
+    """Login to Hugging Face using provided token."""
     try:
         login(token=hf_token)
         return "Logged in successfully!"
 nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", framework="pt")
 def extract_text_from_pdf(pdf_file):
+    """Extracts text from a PDF file using pdfplumber."""
+    try:
+        with pdfplumber.open(pdf_file.name) as pdf:
+            text = ''
+            for page in pdf.pages:
+                text += page.extract_text() or ""  # Handle pages with no text
+        return text
+    except Exception as e:
+        print(f"Error reading PDF: {e}")
+        return ""
 def parse_resume(pdf_file):
+    """Parses the resume and extracts relevant information."""
     # Extract text from PDF
     resume_text = extract_text_from_pdf(pdf_file)
+    if not resume_text.strip():
+        print("No text found in PDF.")
+        return {}
     # Use the NER model to identify entities in the resume
     entities = nlp(resume_text)
     # Initialize empty fields
     name = email = phone = education = skills = experience = None
     # Example parsing logic based on NER output
     for entity in entities:
+        label = entity.get("entity", "")
+        word = entity.get("word", "").strip()
+        if label == "B-PER" or label == "I-PER":
+            name = (name or "") + word + " "
+        elif label == "B-ORG" or label == "I-ORG":
+            experience = (experience or "") + word + " "
+        elif "@" in word:  # Simple email detection
+            email = word
+        elif label == "I-MISC":
+            skills = (skills or "") + word + ", "
+    # Clean up trailing spaces and commas
+    name = name.strip() if name else None
+    skills = skills.rstrip(", ") if skills else None
+    # Log the final parsed information
     print(f"Parsed Info: Name={name}, Email={email}, Skills={skills}, Experience={experience}")
     return {
     }
 def batch_process_resumes(pdf_files):
+    """Processes a batch of resume PDFs and outputs an Excel file."""
     all_resumes = []
     for pdf_file in pdf_files:
         resume_info = parse_resume(pdf_file)
         # Only add the parsed resume info if there's meaningful data
+        if any(resume_info.values()):
             all_resumes.append(resume_info)
+    # If no resumes are successfully parsed, return None
+    if not all_resumes:
+        print("No valid resume information was parsed.")
+        return None
     # Convert to DataFrame
     df = pd.DataFrame(all_resumes)
     # Define the file path for the Excel file
     output_file = "/tmp/parsed_resumes.xlsx"
         # Attempt to log in with provided token
         login_message = login_with_token(hf_token)
         if "Error" not in login_message:
+            # Process resumes and generate the download link
             excel_file_path = batch_process_resumes(pdf_files)
+            if excel_file_path:
+                return login_message + "\nExcel file with parsed resumes is ready for download.", excel_file_path
+            else:
+                return login_message + "\nNo valid resume information was parsed.", None
         else:
             return login_message, None