Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,7 @@ from transformers import pipeline
|
|
8 |
import numpy as np
|
9 |
from scipy.spatial.distance import cosine
|
10 |
import time
|
|
|
11 |
|
12 |
# Set page title and hide sidebar
|
13 |
st.set_page_config(
|
@@ -86,6 +87,133 @@ def extract_text_from_file(file_obj):
|
|
86 |
text = "Unsupported file type. Please upload a .docx, .doc, or .txt file."
|
87 |
return text
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
#####################################
|
90 |
# Function: Summarize Resume Text
|
91 |
#####################################
|
@@ -98,44 +226,32 @@ def summarize_resume_text(resume_text, models):
|
|
98 |
|
99 |
summarizer = models['summarizer']
|
100 |
|
101 |
-
#
|
102 |
max_input_length = 1024 # Model limit
|
103 |
|
104 |
-
# Append instructions to guide the model to extract structured information
|
105 |
-
prompt = f"Summarize this resume and include the candidate's name, age, expected job industry, and skills: {resume_text[:max_input_length]}"
|
106 |
-
|
107 |
if len(resume_text) > max_input_length:
|
108 |
-
# Process in chunks if text is too long
|
109 |
chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
|
110 |
summaries = []
|
111 |
|
112 |
for chunk in chunks:
|
113 |
-
chunk_summary = summarizer(chunk, max_length=
|
114 |
summaries.append(chunk_summary)
|
115 |
|
116 |
-
|
117 |
-
if len(candidate_summary) > max_input_length:
|
118 |
-
candidate_summary = summarizer(f"Provide name, age, expected job industry, and skills of the candidate: {candidate_summary[:max_input_length]}",
|
119 |
-
max_length=150, min_length=40, do_sample=False)[0]['summary_text']
|
120 |
else:
|
121 |
-
|
122 |
-
|
123 |
-
# Format the summary to ensure it contains the required information
|
124 |
-
# If the model doesn't extract all required information, we'll add placeholders
|
125 |
-
formatted_summary = candidate_summary
|
126 |
-
|
127 |
-
# Check if the summary contains the required information and add labels if needed
|
128 |
-
if "name:" not in formatted_summary.lower() and "name " not in formatted_summary.lower():
|
129 |
-
formatted_summary = "Name: [Not explicitly mentioned in resume]\n" + formatted_summary
|
130 |
-
|
131 |
-
if "age:" not in formatted_summary.lower() and "age " not in formatted_summary.lower():
|
132 |
-
formatted_summary += "\nAge: [Not explicitly mentioned in resume]"
|
133 |
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
136 |
|
137 |
-
|
138 |
-
|
|
|
|
|
|
|
139 |
|
140 |
execution_time = time.time() - start_time
|
141 |
|
@@ -176,7 +292,7 @@ st.markdown(
|
|
176 |
"""
|
177 |
Upload your resume file in **.docx**, **.doc**, or **.txt** format. The app performs the following tasks:
|
178 |
1. Extracts text from the resume.
|
179 |
-
2. Uses
|
180 |
3. Compares the candidate summary with a company profile to produce a suitability score.
|
181 |
"""
|
182 |
)
|
|
|
8 |
import numpy as np
|
9 |
from scipy.spatial.distance import cosine
|
10 |
import time
|
11 |
+
import re
|
12 |
|
13 |
# Set page title and hide sidebar
|
14 |
st.set_page_config(
|
|
|
87 |
text = "Unsupported file type. Please upload a .docx, .doc, or .txt file."
|
88 |
return text
|
89 |
|
90 |
+
#####################################
|
91 |
+
# Functions for Information Extraction
|
92 |
+
#####################################
|
93 |
+
def extract_name(text):
    """Guess the candidate's name from the first lines of a resume.

    The heuristic looks only at the first five non-empty lines:

    1. The very first non-empty line is returned when it is 5-40 characters
       long and contains none of the usual document-heading words.
    2. Otherwise the first of the top three lines that has at most four
       words and no heading/contact word is returned.

    Args:
        text: Raw resume text.

    Returns:
        The line believed to be the name, or the placeholder string
        "Unknown (please extract from resume)" when nothing qualifies.
    """
    unknown = "Unknown (please extract from resume)"
    if not text:
        return unknown

    # Words that mark a line as a document heading rather than a name.
    header_words = {"resume", "cv", "curriculum", "vitae", "profile"}
    # The fallback scan additionally rejects contact-detail lines.
    non_name_words = header_words | {"address", "phone", "email"}

    def mentions(line, keywords):
        # Compare whole words, not substrings, so that a surname such as
        # "McVey" is not rejected for containing the letters "cv".
        return not keywords.isdisjoint(re.findall(r"[a-z]+", line.lower()))

    # Drop blank lines *before* taking the first five, so leading blank
    # lines in the extracted text do not hide the name.
    candidates = [line.strip() for line in text.split('\n') if line.strip()][:5]
    if not candidates:
        return unknown

    first_line = candidates[0]
    if 5 <= len(first_line) <= 40 and not mentions(first_line, header_words):
        return first_line

    for line in candidates[:3]:
        if len(line.split()) <= 4 and not mentions(line, non_name_words):
            return line

    return unknown
|
114 |
+
|
115 |
+
def extract_age(text):
    """Extract the candidate's age from resume text, if it is stated.

    Recognised forms, checked in this order:

    * "Age: 29" / "age 29"      -> returns the digits, e.g. "29"
    * "29 years old"            -> returns the digits, e.g. "29"
    * "DOB: 01/02/1990"         -> returns "Mentioned in DOB format"
      (the age is not computed, because the day/month order is ambiguous)

    Args:
        text: Raw resume text.

    Returns:
        The age as a string of one or two digits, the DOB placeholder
        string, or "Not specified" when no pattern matches.
    """
    if not text:
        return "Not specified"

    # Word boundaries keep "page 2" or "average 20" from being read as an
    # age, and stop a longer number such as "2019" from being truncated.
    age_patterns = (
        r'\bage\s*:?\s*(\d{1,2})\b',
        r'\b(\d{1,2})\s*years\s*old\b',
    )
    for pattern in age_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1)

    # Match case-insensitively on the original text: lowercasing the text
    # while searching for an uppercase "DOB" can never succeed.
    dob_pattern = r'\bDOB\s*:?\s*\d{1,2}[/-]\d{1,2}[/-]\d{2,4}'
    if re.search(dob_pattern, text, re.IGNORECASE):
        return "Mentioned in DOB format"

    return "Not specified"
|
136 |
+
|
137 |
+
def _count_industry_keyword(keyword, text):
    """Count occurrences of one industry keyword in the original-case text."""
    escaped = re.escape(keyword)
    if keyword.isupper():
        # Acronyms (IT, AI, SEO, ERP, CRM) must appear as upper-case whole
        # words; otherwise "it" in "with" or "ai" in "email" would count.
        return len(re.findall(rf"\b{escaped}\b", text))
    # Ordinary keywords match case-insensitively from the start of a word,
    # so inflections such as "developers" or "patients" still count.
    return len(re.findall(rf"\b{escaped}", text, re.IGNORECASE))


def extract_industry(text, summary):
    """Guess the candidate's expected job industry from resume text.

    Each industry has a keyword list; the industry whose keywords occur
    most often wins (ties go to the industry listed first). If no keyword
    occurs at all, the text is scanned for a field of study instead.

    Args:
        text: Raw resume text.
        summary: Generated summary of the resume. Currently unused; kept so
            existing callers do not need to change.

    Returns:
        The capitalised industry name (e.g. "Healthcare"), or
        "<Degree>-related field" when only a field of study was found, or
        "Not clearly specified (review resume for details)".
    """
    not_specified = "Not clearly specified (review resume for details)"
    if not text:
        return not_specified

    industry_keywords = {
        "technology": ["software", "programming", "developer", "IT", "tech", "computer", "web", "data science"],
        "finance": ["banking", "investment", "financial", "accounting", "finance", "analyst"],
        "healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor", "patient"],
        "education": ["teaching", "teacher", "professor", "academic", "education", "school", "university"],
        "marketing": ["marketing", "advertising", "brand", "digital marketing", "SEO", "social media"],
        "engineering": ["mechanical", "civil", "electrical", "engineer", "engineering"],
        "consulting": ["consultant", "consulting", "advisory"],
        "data science": ["data science", "machine learning", "AI", "analytics", "big data"],
        "information systems": ["information systems", "ERP", "CRM", "database", "systems management"],
    }

    counts = {
        industry: sum(_count_industry_keyword(keyword, text) for keyword in keywords)
        for industry, keywords in industry_keywords.items()
    }

    # max() returns the first maximal key, i.e. dict order breaks ties.
    likely_industry = max(counts, key=counts.get)
    if counts[likely_industry] > 0:
        return likely_industry.capitalize()

    # Fall back to the educational background as a hint.
    degrees = ["computer science", "business", "engineering", "medicine", "law", "education",
               "finance", "marketing", "information systems"]
    text_lower = text.lower()
    for degree in degrees:
        # Whole-word match so that "law" is not found inside "flaw".
        if re.search(rf"\b{re.escape(degree)}\b", text_lower):
            return f"{degree.capitalize()}-related field"

    return not_specified
|
174 |
+
|
175 |
+
def _skill_pattern(skill):
    """Build a regex that matches *skill* only as a standalone term."""
    # Lookarounds instead of \b so names ending in punctuation ("C++", "C#")
    # still match, while "Java" is not found inside "JavaScript" nor "SQL"
    # inside "MySQL".
    # Very short names ("R", "Go", "C#", "S3") are matched case-sensitively
    # because their lower-case forms are ordinary English words or letters.
    flags = 0 if len(skill) <= 2 else re.IGNORECASE
    return re.compile(rf"(?<!\w){re.escape(skill)}(?!\w)", flags)


def extract_skills(text, summary):
    """List the known skills that a resume mentions, grouped by category.

    Args:
        text: Raw resume text.
        summary: Generated summary of the resume. Currently unused; kept so
            existing callers do not need to change.

    Returns:
        A bulleted string with one "Category: skill, skill" line per
        category that had at least one hit (each line prefixed by a newline
        and a bullet), or a fixed placeholder sentence when no skill is
        found. Skill names keep the spelling used in the table below.
    """
    none_found = "No specific technical skills clearly identified (review resume for details)"
    if not text:
        return none_found

    # Common skill categories and associated keywords.
    skill_categories = {
        "Programming": ["Python", "Java", "C++", "JavaScript", "HTML", "CSS", "SQL", "R", "C#", "PHP",
                        "Ruby", "Swift", "TypeScript", "Go", "Scala", "Kotlin", "Rust"],
        "Data Science": ["Machine Learning", "Deep Learning", "NLP", "Data Analysis", "Statistics",
                         "Big Data", "Data Visualization", "TensorFlow", "PyTorch", "Neural Networks",
                         "Regression", "Classification", "Clustering"],
        "Database": ["SQL", "MySQL", "PostgreSQL", "MongoDB", "Oracle", "SQLite", "NoSQL", "Database Design",
                     "Data Modeling", "ETL", "Data Warehousing"],
        "Web Development": ["React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Express", "RESTful API",
                            "Frontend", "Backend", "Full-Stack", "Responsive Design"],
        "Software Development": ["Agile", "Scrum", "Kanban", "Git", "CI/CD", "TDD", "OOP", "Design Patterns",
                                 "Microservices", "DevOps", "Docker", "Kubernetes"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing", "S3", "EC2", "Lambda", "Serverless",
                  "Cloud Architecture", "IaaS", "PaaS", "SaaS"],
        "Business": ["Project Management", "Business Analysis", "Communication", "Teamwork", "Leadership",
                     "Strategy", "Negotiation", "Presentation", "Time Management"],
        "Tools": ["Excel", "PowerPoint", "Tableau", "Power BI", "JIRA", "Confluence", "Slack", "Microsoft Office",
                  "Adobe", "Photoshop", "Salesforce"],
    }

    found_skills = []
    for category, skills in skill_categories.items():
        category_skills = [skill for skill in skills if _skill_pattern(skill).search(text)]
        if category_skills:
            found_skills.append(f"{category}: {', '.join(category_skills)}")

    if not found_skills:
        return none_found
    return "\n• " + "\n• ".join(found_skills)
|
216 |
+
|
217 |
#####################################
|
218 |
# Function: Summarize Resume Text
|
219 |
#####################################
|
|
|
226 |
|
227 |
summarizer = models['summarizer']
|
228 |
|
229 |
+
# First, generate a general summary
|
230 |
max_input_length = 1024 # Model limit
|
231 |
|
|
|
|
|
|
|
232 |
if len(resume_text) > max_input_length:
|
|
|
233 |
chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
|
234 |
summaries = []
|
235 |
|
236 |
for chunk in chunks:
|
237 |
+
chunk_summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
|
238 |
summaries.append(chunk_summary)
|
239 |
|
240 |
+
base_summary = " ".join(summaries)
|
|
|
|
|
|
|
241 |
else:
|
242 |
+
base_summary = summarizer(resume_text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
|
244 |
+
# Extract specific information using custom extraction logic
|
245 |
+
name = extract_name(resume_text)
|
246 |
+
age = extract_age(resume_text)
|
247 |
+
industry = extract_industry(resume_text, base_summary)
|
248 |
+
skills = extract_skills(resume_text, base_summary)
|
249 |
|
250 |
+
# Format the structured summary
|
251 |
+
formatted_summary = f"Name: {name}\n"
|
252 |
+
formatted_summary += f"Age: {age}\n"
|
253 |
+
formatted_summary += f"Expected Job Industry: {industry}\n"
|
254 |
+
formatted_summary += f"Skills: {skills}"
|
255 |
|
256 |
execution_time = time.time() - start_time
|
257 |
|
|
|
292 |
"""
|
293 |
Upload your resume file in **.docx**, **.doc**, or **.txt** format. The app performs the following tasks:
|
294 |
1. Extracts text from the resume.
|
295 |
+
2. Uses AI to generate a structured candidate summary with name, age, expected job industry, and skills.
|
296 |
3. Compares the candidate summary with a company profile to produce a suitability score.
|
297 |
"""
|
298 |
)
|