Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,11 +4,13 @@ import streamlit as st
|
|
4 |
import docx
|
5 |
import docx2txt
|
6 |
import tempfile
|
7 |
-
from transformers import pipeline
|
8 |
import numpy as np
|
9 |
from scipy.spatial.distance import cosine
|
10 |
import time
|
11 |
import re
|
|
|
|
|
|
|
12 |
|
13 |
# Set page title and hide sidebar
|
14 |
st.set_page_config(
|
@@ -25,18 +27,18 @@ st.markdown("""
|
|
25 |
""", unsafe_allow_html=True)
|
26 |
|
27 |
#####################################
|
28 |
-
# Preload Models
|
29 |
#####################################
|
30 |
@st.cache_resource(show_spinner=True)
|
31 |
def load_models():
|
32 |
-
"""Load models at startup"""
|
33 |
with st.spinner("Loading AI models... This may take a minute on first run."):
|
34 |
models = {}
|
35 |
-
# Load summarization model
|
36 |
-
models['summarizer'] = pipeline("summarization", model="
|
37 |
|
38 |
-
# Load feature extraction model for
|
39 |
-
models['feature_extractor'] = pipeline("feature-extraction", model="
|
40 |
|
41 |
return models
|
42 |
|
@@ -46,6 +48,7 @@ models = load_models()
|
|
46 |
#####################################
|
47 |
# Function: Extract Text from File
|
48 |
#####################################
|
|
|
49 |
def extract_text_from_file(file_obj):
|
50 |
"""
|
51 |
Extract text from .docx and .doc files.
|
@@ -88,73 +91,68 @@ def extract_text_from_file(file_obj):
|
|
88 |
return text
|
89 |
|
90 |
#####################################
|
91 |
-
# Functions for Information Extraction
|
92 |
#####################################
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
|
|
|
|
|
|
97 |
|
98 |
# Check first few non-empty lines for potential names
|
99 |
potential_name_lines = [line.strip() for line in lines[:5] if line.strip()]
|
100 |
|
101 |
if potential_name_lines:
|
102 |
-
# First line is often the name if it's short and doesn't contain common
|
103 |
first_line = potential_name_lines[0]
|
104 |
if 5 <= len(first_line) <= 40 and not any(x in first_line.lower() for x in ["resume", "cv", "curriculum", "vitae", "profile"]):
|
105 |
return first_line
|
106 |
|
107 |
-
# Look for lines that might contain a name
|
108 |
for line in potential_name_lines[:3]:
|
109 |
if len(line.split()) <= 4 and not any(x in line.lower() for x in ["address", "phone", "email", "resume", "cv"]):
|
110 |
return line
|
111 |
|
112 |
-
# If we couldn't find a clear name
|
113 |
return "Unknown (please extract from resume)"
|
114 |
|
115 |
def extract_age(text):
|
116 |
"""Extract candidate age from resume text"""
|
117 |
-
#
|
118 |
-
|
119 |
-
# Look for patterns like "Age: XX" or "XX years old"
|
120 |
age_patterns = [
|
121 |
r'age:?\s*(\d{1,2})',
|
122 |
r'(\d{1,2})\s*years\s*old',
|
123 |
-
r'DOB:?\s*(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})'
|
124 |
]
|
125 |
|
|
|
126 |
for pattern in age_patterns:
|
127 |
-
matches = re.search(pattern,
|
128 |
if matches:
|
129 |
-
|
130 |
-
# Calculate age from DOB - simplified
|
131 |
-
return "Mentioned in DOB format"
|
132 |
-
else:
|
133 |
-
return matches.group(1)
|
134 |
|
135 |
return "Not specified"
|
136 |
|
137 |
-
def extract_industry(text,
|
138 |
"""Extract expected job industry from resume"""
|
139 |
-
#
|
140 |
industry_keywords = {
|
141 |
-
"technology": ["software", "programming", "developer", "IT", "tech", "computer"
|
142 |
-
"finance": ["banking", "
|
143 |
-
"healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor"
|
144 |
-
"education": ["teaching", "teacher", "professor", "
|
145 |
-
"marketing": ["marketing", "advertising", "
|
146 |
-
"engineering": ["
|
147 |
-
"
|
148 |
-
"
|
149 |
-
"information systems": ["information systems", "ERP", "CRM", "database", "systems management"]
|
150 |
}
|
151 |
|
152 |
-
# Count occurrences of industry keywords
|
153 |
-
|
154 |
-
text_lower = text.lower()
|
155 |
|
|
|
156 |
for industry, keywords in industry_keywords.items():
|
157 |
-
counts[industry] = sum(
|
158 |
|
159 |
# Get the industry with the highest count
|
160 |
if counts:
|
@@ -163,229 +161,131 @@ def extract_industry(text, summary):
|
|
163 |
return likely_industry[0].capitalize()
|
164 |
|
165 |
# Check for educational background that might indicate industry
|
166 |
-
degrees = ["computer science", "business", "engineering", "medicine", "
|
167 |
-
"finance", "marketing", "information systems"]
|
168 |
|
169 |
for degree in degrees:
|
170 |
-
if degree in
|
171 |
return f"{degree.capitalize()}-related field"
|
172 |
|
173 |
-
return "Not clearly specified
|
174 |
|
175 |
-
def
|
176 |
-
"""Extract
|
177 |
-
# Common skill categories
|
178 |
skill_categories = {
|
179 |
-
"Programming": ["Python", "Java", "
|
180 |
-
|
181 |
-
"
|
182 |
-
|
183 |
-
|
184 |
-
"
|
185 |
-
|
186 |
-
"
|
187 |
-
"Frontend", "Backend", "Full-Stack", "Responsive Design"],
|
188 |
-
"Software Development": ["Agile", "Scrum", "Kanban", "Git", "CI/CD", "TDD", "OOP", "Design Patterns",
|
189 |
-
"Microservices", "DevOps", "Docker", "Kubernetes"],
|
190 |
-
"Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing", "S3", "EC2", "Lambda", "Serverless",
|
191 |
-
"Cloud Architecture", "IaaS", "PaaS", "SaaS"],
|
192 |
-
"Business": ["Project Management", "Business Analysis", "Communication", "Teamwork", "Leadership",
|
193 |
-
"Strategy", "Negotiation", "Presentation", "Time Management"],
|
194 |
-
"Tools": ["Excel", "PowerPoint", "Tableau", "Power BI", "JIRA", "Confluence", "Slack", "Microsoft Office",
|
195 |
-
"Adobe", "Photoshop", "Salesforce"]
|
196 |
}
|
197 |
|
198 |
-
#
|
199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
text_lower = text.lower()
|
201 |
|
|
|
|
|
202 |
for category, skills in skill_categories.items():
|
203 |
category_skills = []
|
204 |
for skill in skills:
|
205 |
-
# Check for case-insensitive match but preserve original case in output
|
206 |
if skill.lower() in text_lower:
|
207 |
category_skills.append(skill)
|
208 |
|
209 |
if category_skills:
|
210 |
found_skills.append(f"{category}: {', '.join(category_skills)}")
|
211 |
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
return "No specific technical skills clearly identified (review resume for details)"
|
216 |
-
|
217 |
-
def extract_work_experience(text):
|
218 |
-
"""Extract work experience from resume"""
|
219 |
-
# Common section headers for work experience
|
220 |
-
work_headers = [
|
221 |
-
"work experience", "professional experience", "employment history",
|
222 |
-
"work history", "experience", "professional background", "career history"
|
223 |
-
]
|
224 |
-
|
225 |
-
# Common section headers that might come after work experience
|
226 |
-
next_section_headers = [
|
227 |
-
"education", "skills", "certifications", "projects", "achievements",
|
228 |
-
"languages", "interests", "references", "additional information"
|
229 |
-
]
|
230 |
-
|
231 |
-
text_lower = text.lower()
|
232 |
-
lines = text.split('\n')
|
233 |
-
|
234 |
-
# Find the start of work experience section
|
235 |
-
work_start_idx = -1
|
236 |
-
work_header_used = ""
|
237 |
|
238 |
for idx, line in enumerate(lines):
|
239 |
line_lower = line.lower().strip()
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
for idx, line in enumerate(lines):
|
250 |
-
if re.search(date_pattern, line.lower()):
|
251 |
-
# Check surrounding lines for job titles or company names
|
252 |
-
context = " ".join(lines[max(0, idx-2):min(len(lines), idx+3)])
|
253 |
-
if any(title.lower() in context.lower() for title in ["manager", "developer", "engineer", "analyst", "assistant", "director", "coordinator"]):
|
254 |
-
work_start_idx = max(0, idx-2)
|
255 |
-
break
|
256 |
-
|
257 |
-
if work_start_idx == -1:
|
258 |
-
return "No clear work experience section found"
|
259 |
-
|
260 |
-
# Find the end of work experience section
|
261 |
-
work_end_idx = len(lines)
|
262 |
-
for idx in range(work_start_idx + 1, len(lines)):
|
263 |
-
line_lower = lines[idx].lower().strip()
|
264 |
-
if any(header in line_lower for header in next_section_headers):
|
265 |
-
if any(header == line_lower or header + ":" == line_lower for header in next_section_headers):
|
266 |
-
work_end_idx = idx
|
267 |
break
|
268 |
-
|
269 |
-
# Extract the work experience section
|
270 |
-
work_section = lines[work_start_idx + 1:work_end_idx]
|
271 |
-
|
272 |
-
# Process the work experience to make it more concise
|
273 |
-
# Look for companies, positions, dates, and key responsibilities
|
274 |
-
companies = []
|
275 |
-
current_company = {"name": "", "position": "", "dates": "", "description": []}
|
276 |
-
|
277 |
-
for line in work_section:
|
278 |
-
line = line.strip()
|
279 |
-
if not line:
|
280 |
-
continue
|
281 |
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
parts = re.split(r'(19|20)\d{2}', line, 1)
|
294 |
-
if len(parts) > 1:
|
295 |
-
current_company["position"] = parts[0].strip()
|
296 |
-
elif current_company["dates"] and not current_company["name"]:
|
297 |
-
# This line might be the company name or the continuation of position details
|
298 |
-
current_company["name"] = line
|
299 |
-
else:
|
300 |
-
# This is likely a responsibility or detail
|
301 |
-
current_company["description"].append(line)
|
302 |
-
|
303 |
-
# Add the last company if it exists
|
304 |
-
if current_company["name"] or current_company["position"]:
|
305 |
-
companies.append(current_company)
|
306 |
-
|
307 |
-
# Format the work experience
|
308 |
-
if not companies:
|
309 |
-
# Try a different approach - just extract text blocks that might be jobs
|
310 |
-
job_blocks = []
|
311 |
-
current_block = []
|
312 |
|
313 |
for line in work_section:
|
314 |
-
|
315 |
-
if
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
if job_blocks:
|
326 |
-
return "\n• " + "\n• ".join(job_blocks[:3]) # Limit to top 3 entries
|
327 |
-
else:
|
328 |
-
return "Work experience information could not be clearly structured"
|
329 |
-
|
330 |
-
# Format the companies into a readable output
|
331 |
-
formatted_experience = []
|
332 |
-
for company in companies[:3]: # Limit to top 3 most recent positions
|
333 |
-
entry = []
|
334 |
-
if company["position"]:
|
335 |
-
entry.append(f"**{company['position']}**")
|
336 |
-
if company["name"]:
|
337 |
-
entry.append(f"at {company['name']}")
|
338 |
-
if company["dates"]:
|
339 |
-
entry.append(f"({company['dates']})")
|
340 |
-
|
341 |
-
position_line = " ".join(entry)
|
342 |
|
343 |
-
if
|
344 |
-
# Limit to first 2-3 bullet points for conciseness
|
345 |
-
description = company["description"][:3]
|
346 |
-
description_text = "; ".join(description)
|
347 |
-
formatted_experience.append(f"{position_line} - {description_text}")
|
348 |
-
else:
|
349 |
-
formatted_experience.append(position_line)
|
350 |
|
351 |
-
if
|
352 |
-
|
353 |
-
|
354 |
-
return "Work experience information could not be clearly structured"
|
355 |
|
356 |
#####################################
|
357 |
-
# Function: Summarize Resume Text
|
358 |
#####################################
|
359 |
def summarize_resume_text(resume_text, models):
|
360 |
"""
|
361 |
-
Generates a structured summary of the resume text
|
362 |
-
expected job industry, skills, and work experience of the candidate.
|
363 |
"""
|
364 |
start_time = time.time()
|
365 |
|
366 |
summarizer = models['summarizer']
|
367 |
|
368 |
-
# First, generate a
|
369 |
max_input_length = 1024 # Model limit
|
370 |
|
371 |
-
|
372 |
-
|
373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
374 |
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
else:
|
381 |
-
base_summary = summarizer(resume_text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
|
382 |
-
|
383 |
-
# Extract specific information using custom extraction logic
|
384 |
-
name = extract_name(resume_text)
|
385 |
-
age = extract_age(resume_text)
|
386 |
-
industry = extract_industry(resume_text, base_summary)
|
387 |
-
skills = extract_skills(resume_text, base_summary)
|
388 |
-
work_experience = extract_work_experience(resume_text)
|
389 |
|
390 |
# Format the structured summary
|
391 |
formatted_summary = f"Name: {name}\n"
|
@@ -399,8 +299,9 @@ def summarize_resume_text(resume_text, models):
|
|
399 |
return formatted_summary, execution_time
|
400 |
|
401 |
#####################################
|
402 |
-
# Function: Compare Candidate Summary to Company Prompt
|
403 |
#####################################
|
|
|
404 |
def compute_suitability(candidate_summary, company_prompt, models):
|
405 |
"""
|
406 |
Compute the similarity between candidate summary and company prompt.
|
@@ -410,9 +311,13 @@ def compute_suitability(candidate_summary, company_prompt, models):
|
|
410 |
|
411 |
feature_extractor = models['feature_extractor']
|
412 |
|
413 |
-
# Extract features (embeddings)
|
414 |
-
|
415 |
-
|
|
|
|
|
|
|
|
|
416 |
|
417 |
# Convert to numpy arrays and flatten if needed
|
418 |
candidate_vec = np.mean(np.array(candidate_features[0]), axis=0)
|
@@ -426,7 +331,7 @@ def compute_suitability(candidate_summary, company_prompt, models):
|
|
426 |
return similarity, execution_time
|
427 |
|
428 |
#####################################
|
429 |
-
# Main Streamlit Interface
|
430 |
#####################################
|
431 |
st.title("Resume Analyzer and Company Suitability Checker")
|
432 |
st.markdown(
|
@@ -448,38 +353,49 @@ company_prompt = st.text_area(
|
|
448 |
help="Enter a detailed description of the company culture, role requirements, and desired skills.",
|
449 |
)
|
450 |
|
451 |
-
# Process button
|
452 |
if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
|
453 |
-
|
454 |
-
|
455 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
456 |
|
457 |
-
|
458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
459 |
else:
|
460 |
-
|
461 |
-
summary, summarization_time = summarize_resume_text(resume_text, models)
|
462 |
-
|
463 |
-
# Display summary
|
464 |
-
st.subheader("Candidate Summary")
|
465 |
-
st.markdown(summary)
|
466 |
-
st.info(f"Summarization completed in {summarization_time:.2f} seconds")
|
467 |
-
|
468 |
-
# Only compute similarity if company description is provided
|
469 |
-
if company_prompt:
|
470 |
-
similarity_score, similarity_time = compute_suitability(summary, company_prompt, models)
|
471 |
-
|
472 |
-
# Display similarity score
|
473 |
-
st.subheader("Suitability Assessment")
|
474 |
-
st.markdown(f"**Matching Score:** {similarity_score:.2%}")
|
475 |
-
st.info(f"Similarity computation completed in {similarity_time:.2f} seconds")
|
476 |
-
|
477 |
-
# Provide interpretation
|
478 |
-
if similarity_score >= 0.85:
|
479 |
-
st.success("Excellent match! This candidate's profile is strongly aligned with the company requirements.")
|
480 |
-
elif similarity_score >= 0.70:
|
481 |
-
st.success("Good match! This candidate shows strong potential for the position.")
|
482 |
-
elif similarity_score >= 0.50:
|
483 |
-
st.warning("Moderate match. The candidate meets some requirements but there may be gaps.")
|
484 |
-
else:
|
485 |
-
st.error("Low match. The candidate's profile may not align well with the requirements.")
|
|
|
4 |
import docx
|
5 |
import docx2txt
|
6 |
import tempfile
|
|
|
7 |
import numpy as np
|
8 |
from scipy.spatial.distance import cosine
|
9 |
import time
|
10 |
import re
|
11 |
+
import concurrent.futures
|
12 |
+
from functools import lru_cache
|
13 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
14 |
|
15 |
# Set page title and hide sidebar
|
16 |
st.set_page_config(
|
|
|
27 |
""", unsafe_allow_html=True)
|
28 |
|
29 |
#####################################
|
30 |
+
# Preload Models - Optimized
|
31 |
#####################################
|
32 |
@st.cache_resource(show_spinner=True)
def load_models():
    """Load the NLP pipelines once at startup and cache them for reuse.

    Returns a dict with:
      'summarizer'        - abstractive summarization pipeline (BART CNN, max_length=130)
      'feature_extractor' - embedding pipeline (DistilBERT) used for similarity scoring
    """
    with st.spinner("Loading AI models... This may take a minute on first run."):
        # Build both pipelines inside the spinner so the user gets visual
        # feedback during the (slow) first download/initialization.
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn", max_length=130)
        feature_extractor = pipeline("feature-extraction", model="distilbert-base-uncased")
        loaded = {
            'summarizer': summarizer,
            'feature_extractor': feature_extractor,
        }
    return loaded
|
44 |
|
|
|
48 |
#####################################
|
49 |
# Function: Extract Text from File
|
50 |
#####################################
|
51 |
+
@st.cache_data(show_spinner=False)
|
52 |
def extract_text_from_file(file_obj):
|
53 |
"""
|
54 |
Extract text from .docx and .doc files.
|
|
|
91 |
return text
|
92 |
|
93 |
#####################################
|
94 |
+
# Functions for Information Extraction - Optimized
|
95 |
#####################################
|
96 |
+
|
97 |
+
# Cache the extraction functions to avoid reprocessing
|
98 |
+
@lru_cache(maxsize=32)
def extract_name(text_start):
    """Guess the candidate's name from the opening lines of a resume.

    `text_start` is expected to be only the beginning of the resume text
    (the caller passes a short prefix) so the lru_cache stays cheap.
    Returns the best-guess name line, or a placeholder string when no
    plausible name is found.
    """
    # Candidate lines: the first five lines, stripped, blanks dropped.
    candidates = [ln.strip() for ln in text_start.split('\n')[:5] if ln.strip()]

    if candidates:
        # Heuristic 1: a short-ish first line that is not a generic header
        # ("Resume", "Curriculum Vitae", ...) is very likely the name.
        header_words = ("resume", "cv", "curriculum", "vitae", "profile")
        top = candidates[0]
        if 5 <= len(top) <= 40 and not any(w in top.lower() for w in header_words):
            return top

        # Heuristic 2: any of the first three lines with at most four words
        # and no contact-info keywords could be the name.
        contact_words = ("address", "phone", "email", "resume", "cv")
        for candidate in candidates[:3]:
            if len(candidate.split()) <= 4 and not any(w in candidate.lower() for w in contact_words):
                return candidate

    return "Unknown (please extract from resume)"
|
119 |
|
120 |
def extract_age(text):
    """Extract the candidate's age from resume text.

    Scans (case-insensitively) for simple patterns such as "Age: 27" or
    "27 years old" and returns the first captured age as a string.
    Returns "Not specified" when no pattern matches.
    """
    # \b word boundaries prevent false positives from words that merely
    # contain "age" (e.g. "usage: 12", "page: 3") and from 1-2 digit
    # substrings of longer numbers (e.g. "1234 years").
    age_patterns = [
        r'\bage:?\s*(\d{1,2})\b',
        r'\b(\d{1,2})\s*years\s*old\b',
    ]

    text_lower = text.lower()
    for pattern in age_patterns:
        match = re.search(pattern, text_lower)
        if match:
            return match.group(1)

    return "Not specified"
|
135 |
|
136 |
+
def extract_industry(text, base_summary):
|
137 |
"""Extract expected job industry from resume"""
|
138 |
+
# Simplified industry keywords focused on the most common ones
|
139 |
industry_keywords = {
|
140 |
+
"technology": ["software", "programming", "developer", "IT", "tech", "computer"],
|
141 |
+
"finance": ["banking", "financial", "accounting", "finance", "analyst"],
|
142 |
+
"healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor"],
|
143 |
+
"education": ["teaching", "teacher", "professor", "education", "university"],
|
144 |
+
"marketing": ["marketing", "advertising", "digital marketing", "social media"],
|
145 |
+
"engineering": ["engineer", "engineering"],
|
146 |
+
"data science": ["data science", "machine learning", "AI", "analytics"],
|
147 |
+
"information systems": ["information systems", "ERP", "systems management"]
|
|
|
148 |
}
|
149 |
|
150 |
+
# Count occurrences of industry keywords - using the summary to speed up
|
151 |
+
combined_text = base_summary.lower()
|
|
|
152 |
|
153 |
+
counts = {}
|
154 |
for industry, keywords in industry_keywords.items():
|
155 |
+
counts[industry] = sum(combined_text.count(keyword.lower()) for keyword in keywords)
|
156 |
|
157 |
# Get the industry with the highest count
|
158 |
if counts:
|
|
|
161 |
return likely_industry[0].capitalize()
|
162 |
|
163 |
# Check for educational background that might indicate industry
|
164 |
+
degrees = ["computer science", "business", "engineering", "medicine", "education", "finance", "marketing"]
|
|
|
165 |
|
166 |
for degree in degrees:
|
167 |
+
if degree in combined_text:
|
168 |
return f"{degree.capitalize()}-related field"
|
169 |
|
170 |
+
return "Not clearly specified"
|
171 |
|
172 |
+
def extract_skills_and_work(text):
    """Extract both a skills summary and a work-experience summary in one pass.

    Returns a (skills_formatted, work_experience) tuple of display-ready
    strings (bulleted with "•"), each falling back to an explanatory
    message when nothing is found.
    """
    # Common skill categories - reduced keyword list for speed.
    skill_categories = {
        "Programming": ["Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#"],
        "Data Science": ["Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch"],
        "Database": ["SQL", "MySQL", "MongoDB", "Database"],
        "Web Development": ["React", "Angular", "Node.js", "Frontend", "Backend"],
        "Software Development": ["Agile", "Scrum", "Git", "DevOps", "Docker"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud"],
        "Business": ["Project Management", "Business Analysis", "Leadership"],
        "Tools": ["Excel", "PowerPoint", "Tableau", "Power BI", "JIRA"]
    }

    # Headers that open a work-experience section ...
    work_headers = [
        "work experience", "professional experience", "employment history",
        "work history", "experience"
    ]
    # ... and headers that typically follow it (used to stop collecting).
    next_section_headers = [
        "education", "skills", "certifications", "projects", "achievements"
    ]

    lines = text.split('\n')
    text_lower = text.lower()

    # --- Skills: case-insensitive substring match per keyword ---
    found_skills = []
    for category, skills in skill_categories.items():
        category_skills = [skill for skill in skills if skill.lower() in text_lower]
        if category_skills:
            found_skills.append(f"{category}: {', '.join(category_skills)}")

    # --- Work experience: collect non-blank lines between a work header
    # and the first line that looks like the next section ---
    work_section = []
    in_work_section = False

    for line in lines:
        line_lower = line.lower().strip()

        if not in_work_section:
            # Look for the start of the work section; the header line
            # itself is not collected.
            if any(header in line_lower for header in work_headers):
                in_work_section = True
            continue

        # Stop at the first line matching a follow-on section header.
        if any(header in line_lower for header in next_section_headers):
            break

        if line.strip():
            work_section.append(line.strip())

    if not work_section:
        work_experience = "Work experience not clearly identified"
    else:
        # Keep up to 3 dated entries (bolded) plus a few detail lines.
        work_lines = []
        company_count = 0

        for line in work_section:
            # A line containing a year (19xx/20xx) marks a new position entry.
            if re.search(r'(19|20)\d{2}', line):
                company_count += 1
                if company_count <= 3:  # Limit to 3 most recent positions
                    work_lines.append(f"**{line}**")
                else:
                    break
            elif company_count <= 3 and len(work_lines) < 10:  # Limit total lines
                work_lines.append(line)

        work_experience = "\n• " + "\n• ".join(work_lines[:7]) if work_lines else "Work experience not clearly structured"

    skills_formatted = "\n• " + "\n• ".join(found_skills) if found_skills else "No specific technical skills clearly identified"

    return skills_formatted, work_experience
|
|
|
257 |
|
258 |
#####################################
|
259 |
+
# Function: Summarize Resume Text - Optimized
|
260 |
#####################################
|
261 |
def summarize_resume_text(resume_text, models):
|
262 |
"""
|
263 |
+
Generates a structured summary of the resume text - optimized for speed
|
|
|
264 |
"""
|
265 |
start_time = time.time()
|
266 |
|
267 |
summarizer = models['summarizer']
|
268 |
|
269 |
+
# First, generate a quick summary
|
270 |
max_input_length = 1024 # Model limit
|
271 |
|
272 |
+
# Only summarize the first portion of text for speed
|
273 |
+
text_to_summarize = resume_text[:min(len(resume_text), max_input_length)]
|
274 |
+
base_summary = summarizer(text_to_summarize)[0]['summary_text']
|
275 |
+
|
276 |
+
# Extract information in parallel where possible
|
277 |
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
278 |
+
# These can run in parallel
|
279 |
+
name_future = executor.submit(extract_name, resume_text[:500]) # Only use start of text
|
280 |
+
age_future = executor.submit(extract_age, resume_text)
|
281 |
+
industry_future = executor.submit(extract_industry, resume_text, base_summary)
|
282 |
+
skills_work_future = executor.submit(extract_skills_and_work, resume_text)
|
283 |
|
284 |
+
# Get results
|
285 |
+
name = name_future.result()
|
286 |
+
age = age_future.result()
|
287 |
+
industry = industry_future.result()
|
288 |
+
skills, work_experience = skills_work_future.result()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
|
290 |
# Format the structured summary
|
291 |
formatted_summary = f"Name: {name}\n"
|
|
|
299 |
return formatted_summary, execution_time
|
300 |
|
301 |
#####################################
|
302 |
+
# Function: Compare Candidate Summary to Company Prompt - Optimized
|
303 |
#####################################
|
304 |
+
@st.cache_data(show_spinner=False)
|
305 |
def compute_suitability(candidate_summary, company_prompt, models):
|
306 |
"""
|
307 |
Compute the similarity between candidate summary and company prompt.
|
|
|
311 |
|
312 |
feature_extractor = models['feature_extractor']
|
313 |
|
314 |
+
# Extract features (embeddings) - parallelize this
|
315 |
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
316 |
+
candidate_future = executor.submit(feature_extractor, candidate_summary)
|
317 |
+
company_future = executor.submit(feature_extractor, company_prompt)
|
318 |
+
|
319 |
+
candidate_features = candidate_future.result()
|
320 |
+
company_features = company_future.result()
|
321 |
|
322 |
# Convert to numpy arrays and flatten if needed
|
323 |
candidate_vec = np.mean(np.array(candidate_features[0]), axis=0)
|
|
|
331 |
return similarity, execution_time
|
332 |
|
333 |
#####################################
|
334 |
+
# Main Streamlit Interface - with Progress Reporting
|
335 |
#####################################
|
336 |
st.title("Resume Analyzer and Company Suitability Checker")
|
337 |
st.markdown(
|
|
|
353 |
help="Enter a detailed description of the company culture, role requirements, and desired skills.",
|
354 |
)
|
355 |
|
356 |
+
# Process button with optimized flow
|
357 |
# Process button with optimized flow.
# Runs only when a file is uploaded, a company description is entered,
# AND the button is clicked (st.button returns True for one rerun).
if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
    # Placeholders so progress/status can be updated in place, then cleared.
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Step 1: Extract text from the uploaded document.
    status_text.text("Step 1/3: Extracting text from resume...")
    resume_text = extract_text_from_file(uploaded_file)
    progress_bar.progress(25)

    # extract_text_from_file signals failure via sentinel strings rather
    # than exceptions, so match on them here.
    if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx, .doc, or .txt file.":
        st.error(resume_text)
    else:
        # Step 2: Generate the structured candidate summary.
        status_text.text("Step 2/3: Analyzing resume and generating summary...")
        summary, summarization_time = summarize_resume_text(resume_text, models)
        progress_bar.progress(75)

        # Display summary
        st.subheader("Candidate Summary")
        st.markdown(summary)
        st.info(f"Summary generated in {summarization_time:.2f} seconds")

        # Step 3: Compute similarity between summary and company prompt.
        status_text.text("Step 3/3: Calculating compatibility with company profile...")
        similarity_score, similarity_time = compute_suitability(summary, company_prompt, models)
        progress_bar.progress(100)

        # Clear status messages
        status_text.empty()

        # Display similarity score
        st.subheader("Suitability Assessment")
        st.markdown(f"**Matching Score:** {similarity_score:.2%}")
        st.info(f"Compatibility assessment completed in {similarity_time:.2f} seconds")

        # Provide interpretation: thresholds 0.85 / 0.70 / 0.50 map the
        # cosine-similarity score to a qualitative verdict.
        if similarity_score >= 0.85:
            st.success("Excellent match! This candidate's profile is strongly aligned with the company requirements.")
        elif similarity_score >= 0.70:
            st.success("Good match! This candidate shows strong potential for the position.")
        elif similarity_score >= 0.50:
            st.warning("Moderate match. The candidate meets some requirements but there may be gaps.")
        else:
            st.error("Low match. The candidate's profile may not align well with the requirements.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|