Spaces:

Manojajj
/

bert-base-cased-resume_parser

Sleeping

File size: 4,781 Bytes

4eec766
75da080
4eec766
 
 
75da080
88324a0
4eec766
7ddce15
 
18b1dee
7ddce15
 
 
 
 
75da080
 
 
4eec766
 
18b1dee
 
 
 
 
 
 
 
 
 
4eec766
75da080
18b1dee
75da080
 
4eec766
18b1dee
 
 
b5578a6
75da080
 
4eec766
75da080
 
 
 
4eec766
18b1dee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5578a6
 
75da080
 
 
 
 
 
 
4eec766
 
75da080
18b1dee
75da080
4eec766
75da080
b5578a6
 
18b1dee
b5578a6
75da080
18b1dee
 
 
 
 
75da080
 
88324a0
 
 
 
75da080
88324a0
 
 
 
4eec766
75da080
 
 
7ddce15
 
 
 
 
75da080
7ddce15
 
75da080
7ddce15
88324a0
2a2cabb
88324a0
7ddce15
75da080
 
7ddce15
 
 
 
 
 
18b1dee
88324a0
18b1dee
 
 
 
7ddce15
88324a0
7ddce15
 
88324a0
4eec766
7ddce15
75da080

import gradio as gr
import pdfplumber
import torch
from transformers import pipeline
import pandas as pd
from huggingface_hub import login
import os

# Function to login using Hugging Face API token
def login_with_token(hf_token):
    """Login to Hugging Face using provided token."""
    try:
        login(token=hf_token)
        return "Logged in successfully!"
    except Exception as e:
        return f"Error: {str(e)}"

# Load the model for Named Entity Recognition (NER)
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", framework="pt")

def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file using pdfplumber."""
    try:
        with pdfplumber.open(pdf_file.name) as pdf:
            text = ''
            for page in pdf.pages:
                text += page.extract_text() or ""  # Handle pages with no text
        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def parse_resume(pdf_file):
    """Parses the resume and extracts relevant information."""
    # Extract text from PDF
    resume_text = extract_text_from_pdf(pdf_file)
    
    if not resume_text.strip():
        print("No text found in PDF.")
        return {}

    # Use the NER model to identify entities in the resume
    entities = nlp(resume_text)

    # Initialize empty fields
    name = email = phone = education = skills = experience = None

    # Example parsing logic based on NER output
    for entity in entities:
        label = entity.get("entity", "")
        word = entity.get("word", "").strip()

        if label == "B-PER" or label == "I-PER":
            name = (name or "") + word + " "
        elif label == "B-ORG" or label == "I-ORG":
            experience = (experience or "") + word + " "
        elif "@" in word:  # Simple email detection
            email = word
        elif label == "I-MISC":
            skills = (skills or "") + word + ", "

    # Clean up trailing spaces and commas
    name = name.strip() if name else None
    skills = skills.rstrip(", ") if skills else None

    # Log the final parsed information
    print(f"Parsed Info: Name={name}, Email={email}, Skills={skills}, Experience={experience}")

    return {
        'Name': name,
        'Email': email,
        'Phone': phone,
        'Education': education,
        'Skills': skills,
        'Experience': experience,
    }

def batch_process_resumes(pdf_files):
    """Processes a batch of resume PDFs and outputs an Excel file."""
    all_resumes = []
    for pdf_file in pdf_files:
        resume_info = parse_resume(pdf_file)
        
        # Only add the parsed resume info if there's meaningful data
        if any(resume_info.values()):
            all_resumes.append(resume_info)
    
    # If no resumes are successfully parsed, return None
    if not all_resumes:
        print("No valid resume information was parsed.")
        return None

    # Convert to DataFrame
    df = pd.DataFrame(all_resumes)
    
    # Define the file path for the Excel file
    output_file = "/tmp/parsed_resumes.xlsx"
    
    # Save to Excel
    df.to_excel(output_file, index=False)
    
    # Return the path to the file for download
    return output_file

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("### AI Resume Parser")
    
    # User input for Hugging Face token
    hf_token_input = gr.Textbox(label="Hugging Face Token", placeholder="Enter your Hugging Face API Token here")
    
    # File input for resume files
    file_input = gr.File(file_count="multiple", label="Upload Resumes (PDFs)")
    
    # Output for results
    output = gr.Textbox(label="Result")
    
    # File output for the download link
    download_link = gr.File(label="Download Excel File", file_count="single")
    
    # Process button that triggers the login and resume parsing
    process_button = gr.Button("Process Resumes")
    
    # Function call when button is clicked
    def process_resumes(hf_token, pdf_files):
        # Attempt to log in with provided token
        login_message = login_with_token(hf_token)
        
        if "Error" not in login_message:
            # Process resumes and generate the download link
            excel_file_path = batch_process_resumes(pdf_files)
            if excel_file_path:
                return login_message + "\nExcel file with parsed resumes is ready for download.", excel_file_path
            else:
                return login_message + "\nNo valid resume information was parsed.", None
        else:
            return login_message, None

    # Set up the button click event
    process_button.click(process_resumes, inputs=[hf_token_input, file_input], outputs=[output, download_link])

# Launch the Gradio interface
demo.launch()