Spaces:

Prernas19
/

resume_parser

Sleeping

File size: 5,286 Bytes

import gradio as gr
import spacy
import re
import pdfplumber
import docx
import nltk
from nltk.corpus import words
from spacy.cli import download

# Download the SpaCy model if it doesn't exist
download("en_core_web_sm")
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")


# Set of English words
nltk.download('words', quiet=True)
english_words = set(words.words())

def extract_text(file):
    try:
        if file.name.endswith('.pdf'):
            return extract_text_from_pdf(file)
        elif file.name.endswith('.docx'):
            return extract_text_from_docx(file)
        else:
            return "Unsupported file format"
    except Exception as e:
        return f"Error extracting text: {str(e)}"

def extract_text_from_pdf(file):
    text = ''
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ''
    return text

def extract_text_from_docx(file):
    doc = docx.Document(file)
    return "\n".join([para.text for para in doc.paragraphs])

def extract_companies(text):
    # Process the text with the spaCy model
    doc = nlp(text)
    companies = []

    # Define a regex pattern for common company name suffixes
    company_pattern = re.compile(
        r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)

    # Iterate over the identified entities in the text
    for ent in doc.ents:
        if ent.label_ == "ORG":
            # Apply the regex pattern to filter out company names
            if company_pattern.search(ent.text):
                companies.append(ent.text)
    
    return companies

def extract_colleges(text):
    doc = nlp(text)
    colleges = []

    # Extended list of education-related keywords
    edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]

    for sent in doc.sents:
        # Extract entities labeled as ORG and check if they contain education-related keywords
        edu_ents = [ent for ent in sent.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords)]
        for edu in edu_ents:
            colleges.append(edu.text)

    return colleges

def extract_years_of_experience(text):
    years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
    months = re.findall(r'(\d+)\s+month[s]*', text, re.IGNORECASE)

    total_years = sum(map(int, years))
    total_months = sum(map(int, months))

    total_experience_years = total_years + (total_months // 12)
    total_experience_months = total_months % 12

    return f"{total_experience_years} years and {total_experience_months} months" if total_experience_years or total_experience_months else "Not available"

def extract_phone(text):
    phone_patterns = [
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    ]
    for pattern in phone_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()
    return "Not found"

def extract_email(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"

def extract_summary(doc):
    sentences = list(doc.sents)
    summary = []
    for sent in sentences:
        if len(summary) >= 3:  # Limit to 3 sentences
            break
        if len(sent.text.split()) > 5 and sum(1 for word in sent.text.split() if word.lower() in english_words) / len(sent.text.split()) > 0.7:
            summary.append(sent.text)
    return " ".join(summary)

def extract_linkedin(text):
    linkedin_pattern = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-z0-9_-]+\/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"

def parse_resume(file):
    try:
        text = extract_text(file)
        if text.startswith("Error") or text == "Unsupported file format":
            return {"Error": text}

        doc = nlp(text)

        companies = extract_companies(text)
        colleges = extract_colleges(text)
        years_of_experience = extract_years_of_experience(text)
        phone = extract_phone(text)
        email = extract_email(text)
        summary = extract_summary(doc)
        linkedin = extract_linkedin(text)

        result = {
            "Companies Worked For": companies,
            "Colleges Attended": colleges,
            "Years of Experience": years_of_experience,
            "Phone Number": phone,
            "Email ID": email,
            "Summary": summary,
            "LinkedIn ID": linkedin
        }

        return result
    except Exception as e:
        import traceback
        return {"Error": f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"}

# Create Gradio interface
iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=gr.JSON(label="Extracted Information"),
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information."
)

iface.launch(share=True)