Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import spacy
|
3 |
+
import re
|
4 |
+
import pdfplumber
|
5 |
+
import docx
|
6 |
+
import nltk
|
7 |
+
from nltk.corpus import words
|
8 |
+
|
9 |
+
# Load the spaCy model once at import time; the entity extractors and
# parse_resume all go through this shared pipeline.
nlp = spacy.load("en_core_web_sm")

# Set of English words, used by extract_summary to judge whether a sentence
# is mostly ordinary prose.
# Only download the corpus when it is not already installed, so a normal
# start-up does not need network access.
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words', quiet=True)

# extract_summary lower-cases each token before the membership test, so the
# vocabulary is stored lower-cased as well; otherwise capitalised corpus
# entries could never match.
english_words = {word.lower() for word in words.words()}
|
15 |
+
|
16 |
+
def extract_text(file):
    """Extract plain text from an uploaded resume.

    Args:
        file: The upload to read. Either an object that exposes its path
            through a ``name`` attribute, or a plain path string.

    Returns:
        The extracted text on success. ``"Unsupported file format"`` when
        the name does not end in ``.pdf`` or ``.docx`` (case-insensitive).
        A message starting with ``"Error extracting text:"`` when nothing
        was uploaded or the reader raised an exception.
    """
    if file is None:
        return "Error extracting text: no file provided"

    try:
        # Fall back to the object itself so plain path strings work too, and
        # lower-case the name so ".PDF" / ".Docx" are recognised.
        filename = str(getattr(file, "name", file)).lower()
        if filename.endswith('.pdf'):
            return extract_text_from_pdf(file)
        if filename.endswith('.docx'):
            return extract_text_from_docx(file)
        return "Unsupported file format"
    except Exception as e:
        # Best effort by design: the caller recognises the "Error" prefix.
        return f"Error extracting text: {str(e)}"
|
26 |
+
|
27 |
+
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: Whatever ``pdfplumber.open`` accepts for the document.

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        An empty string when no page has extractable text.
    """
    with pdfplumber.open(file) as pdf:
        # extract_text() may return None for a page, hence the "or ''".
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # Join with a newline so the last word of one page is not glued to the
    # first word of the next, which would corrupt names, e-mail addresses
    # and phone numbers that sit at a page boundary.
    return "\n".join(page_text for page_text in page_texts if page_text)
|
33 |
+
|
34 |
+
def extract_text_from_docx(file):
    """Return the text of a DOCX document, including text held in tables.

    Args:
        file: Whatever ``docx.Document`` accepts for the document.

    Returns:
        All body paragraphs followed by the non-empty table cell texts,
        joined with newlines. Table text comes after the paragraphs, so the
        original reading order between the two is not preserved.
    """
    doc = docx.Document(file)
    lines = [para.text for para in doc.paragraphs]

    # Resumes are frequently laid out with tables, whose content is not part
    # of doc.paragraphs and would otherwise be lost.
    for table in doc.tables:
        for row in table.rows:
            previous = None
            for cell in row.cells:
                cell_text = cell.text.strip()
                # NOTE(review): python-docx appears to report a horizontally
                # merged cell once per grid column - skip immediate repeats.
                if cell_text and cell_text != previous:
                    lines.append(cell_text)
                previous = cell_text

    return "\n".join(lines)
|
37 |
+
|
38 |
+
# Legal-form and industry suffixes that mark an ORG entity as a company.
# The trailing dot of abbreviations is deliberately not part of the pattern:
# a word boundary placed after a literal dot only matches when a word
# character follows it, so names ending in "Inc." or "Corp." were rejected.
_COMPANY_SUFFIX_RE = re.compile(
    r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|'
    r'Solutions|Consulting|Associates|Enterprises|Partners|Holdings|Systems|'
    r'Networks|Ventures|International|GmbH|S\.A|S\.L|LLP|PLC|AG)\b',
    re.IGNORECASE,
)


def extract_companies(text, nlp_model=None):
    """List the company names recognised in *text*.

    An entity counts as a company when the pipeline labels it ``ORG`` and
    its text contains one of the known company suffixes.

    Args:
        text: Resume text to analyse.
        nlp_model: Optional callable that turns text into a document with an
            ``ents`` attribute. Defaults to the module-level ``nlp``.

    Returns:
        The distinct company names in order of first appearance, one per
        line. An empty string when none are found.
    """
    pipeline = nlp if nlp_model is None else nlp_model

    companies = {}
    for ent in pipeline(text).ents:
        if ent.label_ != "ORG":
            continue
        # Entities can span line breaks; collapse whitespace so one name
        # cannot be split across several lines of the output.
        name = " ".join(ent.text.split())
        if name and _COMPANY_SUFFIX_RE.search(name):
            # A dict keeps insertion order and drops repeated mentions.
            companies.setdefault(name, None)

    # Join companies with new lines
    return "\n".join(companies)
|
51 |
+
|
52 |
+
def extract_colleges(text, nlp_model=None):
    """List the educational institutions recognised in *text*.

    An entity counts as an institution when the pipeline labels it ``ORG``
    and its lower-cased text contains one of the education keywords.

    Args:
        text: Resume text to analyse.
        nlp_model: Optional callable that turns text into a document with an
            ``ents`` attribute. Defaults to the module-level ``nlp``.

    Returns:
        The distinct institution names in order of first appearance, one per
        line. An empty string when none are found.
    """
    # "institute of technology" is omitted: the substring test on
    # "institute" already covers it.
    edu_keywords = (
        "university", "college", "institute", "school", "academy",
        "polytechnic", "faculty", "department", "center", "centre",
        "campus", "educational",
    )
    pipeline = nlp if nlp_model is None else nlp_model

    colleges = {}
    # Read the entities straight from the document; going through sentences
    # first adds nothing and requires sentence boundaries to be available.
    for ent in pipeline(text).ents:
        if ent.label_ != "ORG":
            continue
        # Entities can span line breaks; collapse whitespace so one name
        # cannot be split across several lines of the output.
        name = " ".join(ent.text.split())
        lowered = name.lower()
        if any(keyword in lowered for keyword in edu_keywords):
            # A dict keeps insertion order and drops repeated mentions.
            colleges.setdefault(name, None)

    # Join colleges with new lines
    return "\n".join(colleges)
|
65 |
+
|
66 |
+
def extract_years_of_experience(text):
    """Add up every duration mentioned in *text* as years and months.

    Recognises whole or decimal numbers followed by "year(s)" / "yr(s)" or
    "month(s)", with an optional "+" or "-" in between ("5+ years",
    "2-year", "3.5 years"). This is a plain sum over all mentions; it does
    not try to tell overlapping or unrelated durations apart.

    Args:
        text: Resume text to scan.

    Returns:
        ``"<Y> years and <M> months"``, or ``"Not available"`` when the
        total is zero.
    """
    # The look-behind stops a match from starting inside a number, so
    # "3.5 years" is read as 3.5 and not as "5 years".
    number = r'(?<![\d.])(\d+(?:\.\d+)?)\s*\+?\s*-?\s*'
    years = re.findall(number + r'(?:years?|yrs?)\b', text, re.IGNORECASE)
    months = re.findall(number + r'months?\b', text, re.IGNORECASE)

    # Work in months so fractional years carry over correctly.
    raw_months = sum(map(float, years)) * 12 + sum(map(float, months))
    total_months = int(raw_months + 0.5)

    if not total_months:
        return "Not available"

    total_experience_years, total_experience_months = divmod(total_months, 12)
    return f"{total_experience_years} years and {total_experience_months} months"
|
77 |
+
|
78 |
+
def extract_phone(text):
    """Return the first North-American-style phone number in *text*.

    Accepts ten digits grouped 3-3-4 with optional "-", "." or whitespace
    separators, an optional parenthesised area code, and an optional
    "1" / "+1" country prefix.

    Args:
        text: Resume text to scan.

    Returns:
        The matched number exactly as written, or ``"Not found"``.
    """
    # Look-arounds replace word-boundary anchors here: a boundary in front
    # of "(" or "+" never matches after whitespace, which made
    # "(555) 123-4567" unmatchable and dropped the "+" of "+1".
    # The former second pattern matched a strict subset of this one.
    phone_pattern = (
        r'(?<!\w)'
        r'(?:\+?1[-.\s]?)?'
        r'(?:\(\d{3}\)|\d{3})'
        r'[-.\s]?\d{3}[-.\s]?\d{4}'
        r'(?!\w)'
    )
    match = re.search(phone_pattern, text)
    return match.group() if match else "Not found"
|
88 |
+
|
89 |
+
def extract_email(text):
    """Return the first e-mail address found in *text*.

    Args:
        text: Resume text to scan.

    Returns:
        The matched address, or ``"Not found"``.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so
    # the top-level domain is restricted to letters only.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
|
93 |
+
|
94 |
+
def extract_summary(doc, max_sentences=3, min_words=5, min_english_ratio=0.7,
                    vocabulary=None):
    """Build a short summary from the first prose-like sentences of *doc*.

    A sentence qualifies when it has more than *min_words*
    whitespace-separated tokens and more than *min_english_ratio* of them
    are found in *vocabulary*.

    Args:
        doc: Parsed document exposing ``sents``, an iterable of sentences
            that each have a ``text`` attribute.
        max_sentences: Maximum number of sentences to keep.
        min_words: A sentence needs more than this many tokens.
        min_english_ratio: Fraction of known tokens that must be exceeded.
        vocabulary: Lower-cased known words. Defaults to the module-level
            ``english_words``.

    Returns:
        The selected sentences with whitespace collapsed, joined by single
        spaces. An empty string when nothing qualifies.
    """
    if vocabulary is None:
        vocabulary = english_words

    # Punctuation stuck to a token ("engineer," / "pipelines.") must be
    # removed before the lookup, otherwise such tokens never count as known.
    edge_punctuation = ".,;:!?\"'()[]{}"

    summary = []
    for sent in doc.sents:
        if len(summary) >= max_sentences:
            break
        tokens = sent.text.split()
        if len(tokens) <= min_words:
            continue
        known = sum(
            1 for token in tokens
            if token.strip(edge_punctuation).lower() in vocabulary
        )
        if known / len(tokens) > min_english_ratio:
            summary.append(" ".join(tokens))
    return " ".join(summary)
|
103 |
+
|
104 |
+
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in *text*.

    Matches ``linkedin.com/in/<handle>`` with an optional scheme
    (``http://``, ``https://`` or ``//``) and an optional subdomain,
    ignoring letter case.

    Args:
        text: Resume text to scan.

    Returns:
        The matched URL as written, or ``"Not found"``.
    """
    linkedin_pattern = (
        r'(?<![\w.-])'              # do not start inside another host name
        r'(?:(?:https?:)?//)?'      # resumes often print the URL bare
        r'(?:[\w-]+\.)?'
        r'linkedin\.com/in/'
        # "A-Za-z", not "A-z": the latter also admits [ \ ] ^ _ and backtick.
        r'[A-Za-z0-9_%-]+/?'
    )
    match = re.search(linkedin_pattern, text, re.IGNORECASE)
    return match.group() if match else "Not found"
|
108 |
+
|
109 |
+
def parse_resume(file):
    """Parse an uploaded resume into the seven fields shown in the UI.

    Args:
        file: The uploaded file as handed over by the file input.

    Returns:
        A 7-tuple of strings: companies, colleges, years of experience,
        phone, e-mail, summary and LinkedIn URL. On failure the tuple has
        the same length, with the error message in the first position and
        empty strings elsewhere, so the number of returned values always
        matches the number of output components.
    """
    def failure(message):
        # Keep the shape identical to the success path.
        return (message,) + ("",) * 6

    try:
        text = extract_text(file)
        if text.startswith("Error") or text == "Unsupported file format":
            return failure(text)

        # NOTE: extract_companies and extract_colleges take the raw text,
        # so this parse only feeds extract_summary.
        doc = nlp(text)

        companies = extract_companies(text)
        colleges = extract_colleges(text)
        years_of_experience = extract_years_of_experience(text)
        phone = extract_phone(text)
        email = extract_email(text)
        summary = extract_summary(doc)
        linkedin = extract_linkedin(text)

        return companies, colleges, years_of_experience, phone, email, summary, linkedin

    except Exception as e:
        import traceback
        # NOTE(review): the traceback is shown to the end user; consider
        # logging it instead if the app is exposed publicly.
        return failure(
            f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        )
|
130 |
+
|
131 |
+
# Keyword arguments for each output textbox, listed in the same order as the
# values returned by parse_resume.
_OUTPUT_SPECS = [
    {"label": "Companies Worked For", "lines": 10},
    {"label": "Colleges Attended", "lines": 10},
    {"label": "Years of Experience"},
    {"label": "Phone Number"},
    {"label": "Email ID"},
    {"label": "Summary", "lines": 3},
    {"label": "LinkedIn ID"},
]

# Build the Gradio interface: one file upload in, one textbox per field out.
iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=[gr.Textbox(**spec) for spec in _OUTPUT_SPECS],
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)

# Start the web app and request a public share link.
iface.launch(share=True)
|