File size: 5,075 Bytes
66dea93
 
 
 
 
 
 
7eb897b
66dea93
7eb897b
 
66dea93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6116745
66dea93
 
51593d8
 
66dea93
51593d8
 
66dea93
 
 
 
 
 
 
 
 
 
 
51593d8
 
 
66dea93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51593d8
 
66dea93
 
51593d8
66dea93
51593d8
66dea93
 
 
51593d8
 
 
 
 
 
 
 
 
66dea93
 
 
 
51593d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import gradio as gr
import spacy
import re
import pdfplumber
import docx
import nltk
from nltk.corpus import words
from spacy.cli import download

# Ensure the spaCy English model is available, downloading it only when the
# initial load fails. (The original called download() unconditionally, which
# re-downloaded the model on every application start.)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:  # spacy.load raises OSError when the model package is missing
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# English word corpus used by extract_summary to score how "readable" a
# sentence is; quiet download is a no-op when the corpus is already cached.
nltk.download('words', quiet=True)
english_words = set(words.words())

def extract_text(file):
    """Dispatch to the extractor matching the uploaded file's extension.

    Returns the extracted text, the literal string "Unsupported file format"
    for unknown extensions, or an "Error extracting text: ..." message when
    extraction raises.
    """
    try:
        # Lowercase the name so uppercase extensions (.PDF, .Docx) are
        # recognized instead of falling through to "unsupported".
        name = file.name.lower()
        if name.endswith('.pdf'):
            return extract_text_from_pdf(file)
        elif name.endswith('.docx'):
            return extract_text_from_docx(file)
        else:
            return "Unsupported file format"
    except Exception as e:
        return f"Error extracting text: {str(e)}"

def extract_text_from_pdf(file):
    """Concatenate the text of every page in the PDF.

    Pages that yield no text (scans, blank pages) contribute an empty string.
    """
    with pdfplumber.open(file) as pdf:
        return "".join(page.extract_text() or '' for page in pdf.pages)

def extract_text_from_docx(file):
    """Return the document's paragraph texts joined with newlines."""
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def extract_companies(text):
    """Return newline-joined ORG entities whose text carries a company suffix.

    An ORG entity counts as a company only if it contains one of the common
    corporate suffixes (Inc, LLC, Ltd, ...), matched case-insensitively.
    """
    suffix_re = re.compile(
        r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)

    doc = nlp(text)
    hits = [
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and suffix_re.search(ent.text)
    ]
    return "\n".join(hits)

def extract_colleges(text):
    """Return newline-joined ORG entities that mention an education keyword."""
    edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]

    doc = nlp(text)
    found = []
    for sent in doc.sents:
        for ent in sent.ents:
            if ent.label_ != "ORG":
                continue
            lowered = ent.text.lower()
            if any(keyword in lowered for keyword in edu_keywords):
                found.append(ent.text)
    return "\n".join(found)

def extract_years_of_experience(text):
    """Sum every "N year(s)"/"N month(s)" mention and report a normalized total.

    Months roll over into years (14 months -> 1 year 2 months). Returns
    "Not available" when no duration is found.
    """
    year_hits = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
    month_hits = re.findall(r'(\d+)\s+month[s]*', text, re.IGNORECASE)

    raw_years = sum(int(value) for value in year_hits)
    raw_months = sum(int(value) for value in month_hits)

    # Carry whole years out of the month total.
    years = raw_years + raw_months // 12
    months = raw_months % 12

    if years or months:
        return f"{years} years and {months} months"
    return "Not available"

def extract_phone(text):
    """Return the first US-style phone number in *text*, else "Not found".

    Tries the stricter pattern (optional +1 country code, optional
    parenthesized area code) before the plain 10-digit fallback.
    """
    phone_res = (
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b',
    )
    for phone_re in phone_res:
        hit = re.search(phone_re, text)
        if hit:
            return hit.group()
    return "Not found"

def extract_email(text):
    """Return the first email address found in *text*, else "Not found".

    Fix: the TLD class was written as [A-Z|a-z], which places a literal '|'
    inside the character class (alternation has no meaning there) and would
    accept a pipe as part of the top-level domain. Corrected to [A-Za-z].
    """
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"

def extract_summary(doc):
    """Pick up to three "readable" sentences from a spaCy doc as a summary.

    A sentence qualifies when it has more than five whitespace tokens and
    over 70% of those tokens (lowercased) appear in the English word list.
    """
    picked = []
    for sent in doc.sents:
        if len(picked) >= 3:  # cap the summary at three sentences
            break
        tokens = sent.text.split()
        if len(tokens) <= 5:
            continue
        known = sum(1 for token in tokens if token.lower() in english_words)
        if known / len(tokens) > 0.7:
            picked.append(sent.text)
    return " ".join(picked)

def extract_linkedin(text):
    """Return the first LinkedIn profile URL in *text*, else "Not found".

    Fix: the slug class used the [A-z] range, which also matches the ASCII
    characters between 'Z' and 'a' ('[', '\\', ']', '^', '_', '`'). Narrowed
    to [A-Za-z0-9_-]; '_' stays allowed since the original listed it too.
    Also dropped the needless '/' escapes (Python regexes don't use '/'
    delimiters).
    """
    linkedin_pattern = r'(?:https?:)?//(?:[\w]+\.)?linkedin\.com/in/[A-Za-z0-9_-]+/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"

def parse_resume(file):
    """Extract all resume fields from an uploaded file.

    Always returns a 7-tuple matching the Gradio output components:
    (companies, colleges, years_of_experience, phone, email, summary,
    linkedin).

    Fix: the error paths previously returned a dict or a single string while
    the interface declares seven outputs, which makes Gradio raise instead of
    showing the error. On failure every field now carries the error message
    so the UI still renders.
    """
    try:
        text = extract_text(file)
        if text.startswith("Error") or text == "Unsupported file format":
            return (text,) * 7

        doc = nlp(text)

        companies = extract_companies(text)
        colleges = extract_colleges(text)
        years_of_experience = extract_years_of_experience(text)
        phone = extract_phone(text)
        email = extract_email(text)
        summary = extract_summary(doc)
        linkedin = extract_linkedin(text)

        return companies, colleges, years_of_experience, phone, email, summary, linkedin

    except Exception as e:
        import traceback
        message = f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return (message,) * 7

# Wire the parser into a Gradio UI: one file input, seven text outputs
# (one per field returned by parse_resume, in the same order).
output_fields = [
    gr.Textbox(label="Companies Worked For", lines=10),
    gr.Textbox(label="Colleges Attended", lines=10),
    gr.Textbox(label="Years of Experience"),
    gr.Textbox(label="Phone Number"),
    gr.Textbox(label="Email ID"),
    gr.Textbox(label="Summary", lines=3),
    gr.Textbox(label="LinkedIn ID"),
]

iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=output_fields,
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)

iface.launch(share=True)