CR7CAD committed on
Commit
e1a5956
·
verified ·
1 Parent(s): d204788

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +175 -259
app.py CHANGED
@@ -4,11 +4,13 @@ import streamlit as st
4
  import docx
5
  import docx2txt
6
  import tempfile
7
- from transformers import pipeline
8
  import numpy as np
9
  from scipy.spatial.distance import cosine
10
  import time
11
  import re
 
 
 
12
 
13
  # Set page title and hide sidebar
14
  st.set_page_config(
@@ -25,18 +27,18 @@ st.markdown("""
25
  """, unsafe_allow_html=True)
26
 
27
  #####################################
28
- # Preload Models
29
  #####################################
30
  @st.cache_resource(show_spinner=True)
31
  def load_models():
32
- """Load models at startup"""
33
  with st.spinner("Loading AI models... This may take a minute on first run."):
34
  models = {}
35
- # Load summarization model
36
- models['summarizer'] = pipeline("summarization", model="t5-base")
37
 
38
- # Load feature extraction model for similarity
39
- models['feature_extractor'] = pipeline("feature-extraction", model="bert-base-uncased")
40
 
41
  return models
42
 
@@ -46,6 +48,7 @@ models = load_models()
46
  #####################################
47
  # Function: Extract Text from File
48
  #####################################
 
49
  def extract_text_from_file(file_obj):
50
  """
51
  Extract text from .docx and .doc files.
@@ -88,73 +91,68 @@ def extract_text_from_file(file_obj):
88
  return text
89
 
90
  #####################################
91
- # Functions for Information Extraction
92
  #####################################
93
- def extract_name(text):
94
- """Extract candidate name from resume text"""
95
- # Look for common name patterns at the beginning of resumes
96
- lines = text.split('\n')
 
 
 
97
 
98
  # Check first few non-empty lines for potential names
99
  potential_name_lines = [line.strip() for line in lines[:5] if line.strip()]
100
 
101
  if potential_name_lines:
102
- # First line is often the name if it's short and doesn't contain common resume headers
103
  first_line = potential_name_lines[0]
104
  if 5 <= len(first_line) <= 40 and not any(x in first_line.lower() for x in ["resume", "cv", "curriculum", "vitae", "profile"]):
105
  return first_line
106
 
107
- # Look for lines that might contain a name (not containing common keywords)
108
  for line in potential_name_lines[:3]:
109
  if len(line.split()) <= 4 and not any(x in line.lower() for x in ["address", "phone", "email", "resume", "cv"]):
110
  return line
111
 
112
- # If we couldn't find a clear name
113
  return "Unknown (please extract from resume)"
114
 
115
  def extract_age(text):
116
  """Extract candidate age from resume text"""
117
- # Look for common age patterns
118
-
119
- # Look for patterns like "Age: XX" or "XX years old"
120
  age_patterns = [
121
  r'age:?\s*(\d{1,2})',
122
  r'(\d{1,2})\s*years\s*old',
123
- r'DOB:?\s*(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})'
124
  ]
125
 
 
126
  for pattern in age_patterns:
127
- matches = re.search(pattern, text.lower())
128
  if matches:
129
- if pattern == age_patterns[2]: # DOB pattern
130
- # Calculate age from DOB - simplified
131
- return "Mentioned in DOB format"
132
- else:
133
- return matches.group(1)
134
 
135
  return "Not specified"
136
 
137
- def extract_industry(text, summary):
138
  """Extract expected job industry from resume"""
139
- # Look for industry-related keywords
140
  industry_keywords = {
141
- "technology": ["software", "programming", "developer", "IT", "tech", "computer", "web", "data science"],
142
- "finance": ["banking", "investment", "financial", "accounting", "finance", "analyst"],
143
- "healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor", "patient"],
144
- "education": ["teaching", "teacher", "professor", "academic", "education", "school", "university"],
145
- "marketing": ["marketing", "advertising", "brand", "digital marketing", "SEO", "social media"],
146
- "engineering": ["mechanical", "civil", "electrical", "engineer", "engineering"],
147
- "consulting": ["consultant", "consulting", "advisory"],
148
- "data science": ["data science", "machine learning", "AI", "analytics", "big data"],
149
- "information systems": ["information systems", "ERP", "CRM", "database", "systems management"]
150
  }
151
 
152
- # Count occurrences of industry keywords
153
- counts = {}
154
- text_lower = text.lower()
155
 
 
156
  for industry, keywords in industry_keywords.items():
157
- counts[industry] = sum(text_lower.count(keyword.lower()) for keyword in keywords)
158
 
159
  # Get the industry with the highest count
160
  if counts:
@@ -163,229 +161,131 @@ def extract_industry(text, summary):
163
  return likely_industry[0].capitalize()
164
 
165
  # Check for educational background that might indicate industry
166
- degrees = ["computer science", "business", "engineering", "medicine", "law", "education",
167
- "finance", "marketing", "information systems"]
168
 
169
  for degree in degrees:
170
- if degree in text_lower:
171
  return f"{degree.capitalize()}-related field"
172
 
173
- return "Not clearly specified (review resume for details)"
174
 
175
- def extract_skills(text, summary):
176
- """Extract key skills from resume"""
177
- # Common skill categories and associated keywords
178
  skill_categories = {
179
- "Programming": ["Python", "Java", "C++", "JavaScript", "HTML", "CSS", "SQL", "R", "C#", "PHP",
180
- "Ruby", "Swift", "TypeScript", "Go", "Scala", "Kotlin", "Rust"],
181
- "Data Science": ["Machine Learning", "Deep Learning", "NLP", "Data Analysis", "Statistics",
182
- "Big Data", "Data Visualization", "TensorFlow", "PyTorch", "Neural Networks",
183
- "Regression", "Classification", "Clustering"],
184
- "Database": ["SQL", "MySQL", "PostgreSQL", "MongoDB", "Oracle", "SQLite", "NoSQL", "Database Design",
185
- "Data Modeling", "ETL", "Data Warehousing"],
186
- "Web Development": ["React", "Angular", "Vue.js", "Node.js", "Django", "Flask", "Express", "RESTful API",
187
- "Frontend", "Backend", "Full-Stack", "Responsive Design"],
188
- "Software Development": ["Agile", "Scrum", "Kanban", "Git", "CI/CD", "TDD", "OOP", "Design Patterns",
189
- "Microservices", "DevOps", "Docker", "Kubernetes"],
190
- "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud Computing", "S3", "EC2", "Lambda", "Serverless",
191
- "Cloud Architecture", "IaaS", "PaaS", "SaaS"],
192
- "Business": ["Project Management", "Business Analysis", "Communication", "Teamwork", "Leadership",
193
- "Strategy", "Negotiation", "Presentation", "Time Management"],
194
- "Tools": ["Excel", "PowerPoint", "Tableau", "Power BI", "JIRA", "Confluence", "Slack", "Microsoft Office",
195
- "Adobe", "Photoshop", "Salesforce"]
196
  }
197
 
198
- # Find skills mentioned in the resume
199
- found_skills = []
 
 
 
 
 
 
 
 
 
 
200
  text_lower = text.lower()
201
 
 
 
202
  for category, skills in skill_categories.items():
203
  category_skills = []
204
  for skill in skills:
205
- # Check for case-insensitive match but preserve original case in output
206
  if skill.lower() in text_lower:
207
  category_skills.append(skill)
208
 
209
  if category_skills:
210
  found_skills.append(f"{category}: {', '.join(category_skills)}")
211
 
212
- if found_skills:
213
- return "\n• " + "\n• ".join(found_skills)
214
- else:
215
- return "No specific technical skills clearly identified (review resume for details)"
216
-
217
- def extract_work_experience(text):
218
- """Extract work experience from resume"""
219
- # Common section headers for work experience
220
- work_headers = [
221
- "work experience", "professional experience", "employment history",
222
- "work history", "experience", "professional background", "career history"
223
- ]
224
-
225
- # Common section headers that might come after work experience
226
- next_section_headers = [
227
- "education", "skills", "certifications", "projects", "achievements",
228
- "languages", "interests", "references", "additional information"
229
- ]
230
-
231
- text_lower = text.lower()
232
- lines = text.split('\n')
233
-
234
- # Find the start of work experience section
235
- work_start_idx = -1
236
- work_header_used = ""
237
 
238
  for idx, line in enumerate(lines):
239
  line_lower = line.lower().strip()
240
- if any(header in line_lower for header in work_headers):
241
- if any(header == line_lower or header + ":" == line_lower for header in work_headers):
242
- work_start_idx = idx
243
- work_header_used = line.strip()
244
- break
245
-
246
- if work_start_idx == -1:
247
- # Try to find work experience by looking for date patterns (common in resumes)
248
- date_pattern = r'(19|20)\d{2}\s*(-|–|to)\s*(19|20)\d{2}|present|current|now'
249
- for idx, line in enumerate(lines):
250
- if re.search(date_pattern, line.lower()):
251
- # Check surrounding lines for job titles or company names
252
- context = " ".join(lines[max(0, idx-2):min(len(lines), idx+3)])
253
- if any(title.lower() in context.lower() for title in ["manager", "developer", "engineer", "analyst", "assistant", "director", "coordinator"]):
254
- work_start_idx = max(0, idx-2)
255
- break
256
-
257
- if work_start_idx == -1:
258
- return "No clear work experience section found"
259
-
260
- # Find the end of work experience section
261
- work_end_idx = len(lines)
262
- for idx in range(work_start_idx + 1, len(lines)):
263
- line_lower = lines[idx].lower().strip()
264
- if any(header in line_lower for header in next_section_headers):
265
- if any(header == line_lower or header + ":" == line_lower for header in next_section_headers):
266
- work_end_idx = idx
267
  break
268
-
269
- # Extract the work experience section
270
- work_section = lines[work_start_idx + 1:work_end_idx]
271
-
272
- # Process the work experience to make it more concise
273
- # Look for companies, positions, dates, and key responsibilities
274
- companies = []
275
- current_company = {"name": "", "position": "", "dates": "", "description": []}
276
-
277
- for line in work_section:
278
- line = line.strip()
279
- if not line:
280
- continue
281
 
282
- # Check if this is likely a new company/position entry
283
- if re.search(r'(19|20)\d{2}\s*(-|–|to)\s*(19|20)\d{2}|present|current|now', line.lower()):
284
- # Save previous company if it exists
285
- if current_company["name"] or current_company["position"]:
286
- companies.append(current_company)
287
- current_company = {"name": "", "position": "", "dates": "", "description": []}
288
-
289
- # This line likely contains position/company and dates
290
- current_company["dates"] = line
291
-
292
- # Try to extract position and company
293
- parts = re.split(r'(19|20)\d{2}', line, 1)
294
- if len(parts) > 1:
295
- current_company["position"] = parts[0].strip()
296
- elif current_company["dates"] and not current_company["name"]:
297
- # This line might be the company name or the continuation of position details
298
- current_company["name"] = line
299
- else:
300
- # This is likely a responsibility or detail
301
- current_company["description"].append(line)
302
-
303
- # Add the last company if it exists
304
- if current_company["name"] or current_company["position"]:
305
- companies.append(current_company)
306
-
307
- # Format the work experience
308
- if not companies:
309
- # Try a different approach - just extract text blocks that might be jobs
310
- job_blocks = []
311
- current_block = []
312
 
313
  for line in work_section:
314
- line = line.strip()
315
- if not line:
316
- if current_block:
317
- job_blocks.append(" ".join(current_block))
318
- current_block = []
319
- else:
320
- current_block.append(line)
321
-
322
- if current_block:
323
- job_blocks.append(" ".join(current_block))
324
-
325
- if job_blocks:
326
- return "\n• " + "\n• ".join(job_blocks[:3]) # Limit to top 3 entries
327
- else:
328
- return "Work experience information could not be clearly structured"
329
-
330
- # Format the companies into a readable output
331
- formatted_experience = []
332
- for company in companies[:3]: # Limit to top 3 most recent positions
333
- entry = []
334
- if company["position"]:
335
- entry.append(f"**{company['position']}**")
336
- if company["name"]:
337
- entry.append(f"at {company['name']}")
338
- if company["dates"]:
339
- entry.append(f"({company['dates']})")
340
-
341
- position_line = " ".join(entry)
342
 
343
- if company["description"]:
344
- # Limit to first 2-3 bullet points for conciseness
345
- description = company["description"][:3]
346
- description_text = "; ".join(description)
347
- formatted_experience.append(f"{position_line} - {description_text}")
348
- else:
349
- formatted_experience.append(position_line)
350
 
351
- if formatted_experience:
352
- return "\n• " + "\n• ".join(formatted_experience)
353
- else:
354
- return "Work experience information could not be clearly structured"
355
 
356
  #####################################
357
- # Function: Summarize Resume Text
358
  #####################################
359
  def summarize_resume_text(resume_text, models):
360
  """
361
- Generates a structured summary of the resume text including name, age,
362
- expected job industry, skills, and work experience of the candidate.
363
  """
364
  start_time = time.time()
365
 
366
  summarizer = models['summarizer']
367
 
368
- # First, generate a general summary
369
  max_input_length = 1024 # Model limit
370
 
371
- if len(resume_text) > max_input_length:
372
- chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
373
- summaries = []
 
 
 
 
 
 
 
 
374
 
375
- for chunk in chunks:
376
- chunk_summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
377
- summaries.append(chunk_summary)
378
-
379
- base_summary = " ".join(summaries)
380
- else:
381
- base_summary = summarizer(resume_text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
382
-
383
- # Extract specific information using custom extraction logic
384
- name = extract_name(resume_text)
385
- age = extract_age(resume_text)
386
- industry = extract_industry(resume_text, base_summary)
387
- skills = extract_skills(resume_text, base_summary)
388
- work_experience = extract_work_experience(resume_text)
389
 
390
  # Format the structured summary
391
  formatted_summary = f"Name: {name}\n"
@@ -399,8 +299,9 @@ def summarize_resume_text(resume_text, models):
399
  return formatted_summary, execution_time
400
 
401
  #####################################
402
- # Function: Compare Candidate Summary to Company Prompt
403
  #####################################
 
404
  def compute_suitability(candidate_summary, company_prompt, models):
405
  """
406
  Compute the similarity between candidate summary and company prompt.
@@ -410,9 +311,13 @@ def compute_suitability(candidate_summary, company_prompt, models):
410
 
411
  feature_extractor = models['feature_extractor']
412
 
413
- # Extract features (embeddings)
414
- candidate_features = feature_extractor(candidate_summary)
415
- company_features = feature_extractor(company_prompt)
 
 
 
 
416
 
417
  # Convert to numpy arrays and flatten if needed
418
  candidate_vec = np.mean(np.array(candidate_features[0]), axis=0)
@@ -426,7 +331,7 @@ def compute_suitability(candidate_summary, company_prompt, models):
426
  return similarity, execution_time
427
 
428
  #####################################
429
- # Main Streamlit Interface
430
  #####################################
431
  st.title("Resume Analyzer and Company Suitability Checker")
432
  st.markdown(
@@ -448,38 +353,49 @@ company_prompt = st.text_area(
448
  help="Enter a detailed description of the company culture, role requirements, and desired skills.",
449
  )
450
 
451
- # Process button
452
  if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
453
- with st.spinner("Processing..."):
454
- # Extract text from resume
455
- resume_text = extract_text_from_file(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
 
457
- if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx, .doc, or .txt file.":
458
- st.error(resume_text)
 
 
 
 
 
 
 
 
 
 
459
  else:
460
- # Generate summary
461
- summary, summarization_time = summarize_resume_text(resume_text, models)
462
-
463
- # Display summary
464
- st.subheader("Candidate Summary")
465
- st.markdown(summary)
466
- st.info(f"Summarization completed in {summarization_time:.2f} seconds")
467
-
468
- # Only compute similarity if company description is provided
469
- if company_prompt:
470
- similarity_score, similarity_time = compute_suitability(summary, company_prompt, models)
471
-
472
- # Display similarity score
473
- st.subheader("Suitability Assessment")
474
- st.markdown(f"**Matching Score:** {similarity_score:.2%}")
475
- st.info(f"Similarity computation completed in {similarity_time:.2f} seconds")
476
-
477
- # Provide interpretation
478
- if similarity_score >= 0.85:
479
- st.success("Excellent match! This candidate's profile is strongly aligned with the company requirements.")
480
- elif similarity_score >= 0.70:
481
- st.success("Good match! This candidate shows strong potential for the position.")
482
- elif similarity_score >= 0.50:
483
- st.warning("Moderate match. The candidate meets some requirements but there may be gaps.")
484
- else:
485
- st.error("Low match. The candidate's profile may not align well with the requirements.")
 
4
  import docx
5
  import docx2txt
6
  import tempfile
 
7
  import numpy as np
8
  from scipy.spatial.distance import cosine
9
  import time
10
  import re
11
+ import concurrent.futures
12
+ from functools import lru_cache
13
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
14
 
15
  # Set page title and hide sidebar
16
  st.set_page_config(
 
27
  """, unsafe_allow_html=True)
28
 
29
  #####################################
30
+ # Preload Models - Optimized
31
  #####################################
32
  @st.cache_resource(show_spinner=True)
33
  def load_models():
34
+ """Load models at startup - using smaller/faster models"""
35
  with st.spinner("Loading AI models... This may take a minute on first run."):
36
  models = {}
37
+ # Load smaller summarization model for speed
38
+ models['summarizer'] = pipeline("summarization", model="facebook/bart-large-cnn", max_length=130)
39
 
40
+ # Load smaller feature extraction model for speed
41
+ models['feature_extractor'] = pipeline("feature-extraction", model="distilbert-base-uncased")
42
 
43
  return models
44
 
 
48
  #####################################
49
  # Function: Extract Text from File
50
  #####################################
51
+ @st.cache_data(show_spinner=False)
52
  def extract_text_from_file(file_obj):
53
  """
54
  Extract text from .docx and .doc files.
 
91
  return text
92
 
93
  #####################################
94
+ # Functions for Information Extraction - Optimized
95
  #####################################
96
+
97
+ # Cache the extraction functions to avoid reprocessing
98
+ @lru_cache(maxsize=32)
99
+ def extract_name(text_start):
100
+ """Extract candidate name from the beginning of resume text"""
101
+ # Only use the first 500 characters to speed up processing
102
+ lines = text_start.split('\n')
103
 
104
  # Check first few non-empty lines for potential names
105
  potential_name_lines = [line.strip() for line in lines[:5] if line.strip()]
106
 
107
  if potential_name_lines:
108
+ # First line is often the name if it's short and doesn't contain common headers
109
  first_line = potential_name_lines[0]
110
  if 5 <= len(first_line) <= 40 and not any(x in first_line.lower() for x in ["resume", "cv", "curriculum", "vitae", "profile"]):
111
  return first_line
112
 
113
+ # Look for lines that might contain a name
114
  for line in potential_name_lines[:3]:
115
  if len(line.split()) <= 4 and not any(x in line.lower() for x in ["address", "phone", "email", "resume", "cv"]):
116
  return line
117
 
 
118
  return "Unknown (please extract from resume)"
119
 
120
  def extract_age(text):
121
  """Extract candidate age from resume text"""
122
+ # Simplified: just check a few common patterns
 
 
123
  age_patterns = [
124
  r'age:?\s*(\d{1,2})',
125
  r'(\d{1,2})\s*years\s*old',
 
126
  ]
127
 
128
+ text_lower = text.lower()
129
  for pattern in age_patterns:
130
+ matches = re.search(pattern, text_lower)
131
  if matches:
132
+ return matches.group(1)
 
 
 
 
133
 
134
  return "Not specified"
135
 
136
+ def extract_industry(text, base_summary):
137
  """Extract expected job industry from resume"""
138
+ # Simplified industry keywords focused on the most common ones
139
  industry_keywords = {
140
+ "technology": ["software", "programming", "developer", "IT", "tech", "computer"],
141
+ "finance": ["banking", "financial", "accounting", "finance", "analyst"],
142
+ "healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor"],
143
+ "education": ["teaching", "teacher", "professor", "education", "university"],
144
+ "marketing": ["marketing", "advertising", "digital marketing", "social media"],
145
+ "engineering": ["engineer", "engineering"],
146
+ "data science": ["data science", "machine learning", "AI", "analytics"],
147
+ "information systems": ["information systems", "ERP", "systems management"]
 
148
  }
149
 
150
+ # Count occurrences of industry keywords - using the summary to speed up
151
+ combined_text = base_summary.lower()
 
152
 
153
+ counts = {}
154
  for industry, keywords in industry_keywords.items():
155
+ counts[industry] = sum(combined_text.count(keyword.lower()) for keyword in keywords)
156
 
157
  # Get the industry with the highest count
158
  if counts:
 
161
  return likely_industry[0].capitalize()
162
 
163
  # Check for educational background that might indicate industry
164
+ degrees = ["computer science", "business", "engineering", "medicine", "education", "finance", "marketing"]
 
165
 
166
  for degree in degrees:
167
+ if degree in combined_text:
168
  return f"{degree.capitalize()}-related field"
169
 
170
+ return "Not clearly specified"
171
 
172
+ def extract_skills_and_work(text):
173
+ """Extract both skills and work experience at once to save processing time"""
174
+ # Common skill categories - reduced keyword list for speed
175
  skill_categories = {
176
+ "Programming": ["Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#"],
177
+ "Data Science": ["Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch"],
178
+ "Database": ["SQL", "MySQL", "MongoDB", "Database"],
179
+ "Web Development": ["React", "Angular", "Node.js", "Frontend", "Backend"],
180
+ "Software Development": ["Agile", "Scrum", "Git", "DevOps", "Docker"],
181
+ "Cloud": ["AWS", "Azure", "Google Cloud", "Cloud"],
182
+ "Business": ["Project Management", "Business Analysis", "Leadership"],
183
+ "Tools": ["Excel", "PowerPoint", "Tableau", "Power BI", "JIRA"]
 
 
 
 
 
 
 
 
 
184
  }
185
 
186
+ # Work experience extraction
187
+ work_headers = [
188
+ "work experience", "professional experience", "employment history",
189
+ "work history", "experience"
190
+ ]
191
+
192
+ next_section_headers = [
193
+ "education", "skills", "certifications", "projects", "achievements"
194
+ ]
195
+
196
+ # Process everything at once
197
+ lines = text.split('\n')
198
  text_lower = text.lower()
199
 
200
+ # Skills extraction
201
+ found_skills = []
202
  for category, skills in skill_categories.items():
203
  category_skills = []
204
  for skill in skills:
 
205
  if skill.lower() in text_lower:
206
  category_skills.append(skill)
207
 
208
  if category_skills:
209
  found_skills.append(f"{category}: {', '.join(category_skills)}")
210
 
211
+ # Work experience extraction - simplified approach
212
+ work_section = []
213
+ in_work_section = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  for idx, line in enumerate(lines):
216
  line_lower = line.lower().strip()
217
+
218
+ # Start of work section
219
+ if not in_work_section:
220
+ if any(header in line_lower for header in work_headers):
221
+ in_work_section = True
222
+ continue
223
+ # End of work section
224
+ elif in_work_section:
225
+ if any(header in line_lower for header in next_section_headers):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  break
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
+ if line.strip():
229
+ work_section.append(line.strip())
230
+
231
+ # Simplified work formatting
232
+ if not work_section:
233
+ work_experience = "Work experience not clearly identified"
234
+ else:
235
+ # Just take the first 5-7 lines of the work section as a summary
236
+ work_lines = []
237
+ company_count = 0
238
+ current_company = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
  for line in work_section:
241
+ # New company entry often has a date
242
+ if re.search(r'(19|20)\d{2}', line):
243
+ company_count += 1
244
+ if company_count <= 3: # Limit to 3 most recent positions
245
+ current_company = line
246
+ work_lines.append(f"**{line}**")
247
+ else:
248
+ break
249
+ elif company_count <= 3 and len(work_lines) < 10: # Limit total lines
250
+ work_lines.append(line)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
+ work_experience = "\n• " + "\n• ".join(work_lines[:7]) if work_lines else "Work experience not clearly structured"
 
 
 
 
 
 
253
 
254
+ skills_formatted = "\n• " + "\n• ".join(found_skills) if found_skills else "No specific technical skills clearly identified"
255
+
256
+ return skills_formatted, work_experience
 
257
 
258
  #####################################
259
+ # Function: Summarize Resume Text - Optimized
260
  #####################################
261
  def summarize_resume_text(resume_text, models):
262
  """
263
+ Generates a structured summary of the resume text - optimized for speed
 
264
  """
265
  start_time = time.time()
266
 
267
  summarizer = models['summarizer']
268
 
269
+ # First, generate a quick summary
270
  max_input_length = 1024 # Model limit
271
 
272
+ # Only summarize the first portion of text for speed
273
+ text_to_summarize = resume_text[:min(len(resume_text), max_input_length)]
274
+ base_summary = summarizer(text_to_summarize)[0]['summary_text']
275
+
276
+ # Extract information in parallel where possible
277
+ with concurrent.futures.ThreadPoolExecutor() as executor:
278
+ # These can run in parallel
279
+ name_future = executor.submit(extract_name, resume_text[:500]) # Only use start of text
280
+ age_future = executor.submit(extract_age, resume_text)
281
+ industry_future = executor.submit(extract_industry, resume_text, base_summary)
282
+ skills_work_future = executor.submit(extract_skills_and_work, resume_text)
283
 
284
+ # Get results
285
+ name = name_future.result()
286
+ age = age_future.result()
287
+ industry = industry_future.result()
288
+ skills, work_experience = skills_work_future.result()
 
 
 
 
 
 
 
 
 
289
 
290
  # Format the structured summary
291
  formatted_summary = f"Name: {name}\n"
 
299
  return formatted_summary, execution_time
300
 
301
  #####################################
302
+ # Function: Compare Candidate Summary to Company Prompt - Optimized
303
  #####################################
304
+ @st.cache_data(show_spinner=False)
305
  def compute_suitability(candidate_summary, company_prompt, models):
306
  """
307
  Compute the similarity between candidate summary and company prompt.
 
311
 
312
  feature_extractor = models['feature_extractor']
313
 
314
+ # Extract features (embeddings) - parallelize this
315
+ with concurrent.futures.ThreadPoolExecutor() as executor:
316
+ candidate_future = executor.submit(feature_extractor, candidate_summary)
317
+ company_future = executor.submit(feature_extractor, company_prompt)
318
+
319
+ candidate_features = candidate_future.result()
320
+ company_features = company_future.result()
321
 
322
  # Convert to numpy arrays and flatten if needed
323
  candidate_vec = np.mean(np.array(candidate_features[0]), axis=0)
 
331
  return similarity, execution_time
332
 
333
  #####################################
334
+ # Main Streamlit Interface - with Progress Reporting
335
  #####################################
336
  st.title("Resume Analyzer and Company Suitability Checker")
337
  st.markdown(
 
353
  help="Enter a detailed description of the company culture, role requirements, and desired skills.",
354
  )
355
 
356
+ # Process button with optimized flow
357
  if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
358
+ # Create a placeholder for the progress bar
359
+ progress_bar = st.progress(0)
360
+ status_text = st.empty()
361
+
362
+ # Step 1: Extract text
363
+ status_text.text("Step 1/3: Extracting text from resume...")
364
+ resume_text = extract_text_from_file(uploaded_file)
365
+ progress_bar.progress(25)
366
+
367
+ if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx, .doc, or .txt file.":
368
+ st.error(resume_text)
369
+ else:
370
+ # Step 2: Generate summary
371
+ status_text.text("Step 2/3: Analyzing resume and generating summary...")
372
+ summary, summarization_time = summarize_resume_text(resume_text, models)
373
+ progress_bar.progress(75)
374
+
375
+ # Display summary
376
+ st.subheader("Candidate Summary")
377
+ st.markdown(summary)
378
+ st.info(f"Summary generated in {summarization_time:.2f} seconds")
379
+
380
+ # Step 3: Compute similarity
381
+ status_text.text("Step 3/3: Calculating compatibility with company profile...")
382
+ similarity_score, similarity_time = compute_suitability(summary, company_prompt, models)
383
+ progress_bar.progress(100)
384
+
385
+ # Clear status messages
386
+ status_text.empty()
387
 
388
+ # Display similarity score
389
+ st.subheader("Suitability Assessment")
390
+ st.markdown(f"**Matching Score:** {similarity_score:.2%}")
391
+ st.info(f"Compatibility assessment completed in {similarity_time:.2f} seconds")
392
+
393
+ # Provide interpretation
394
+ if similarity_score >= 0.85:
395
+ st.success("Excellent match! This candidate's profile is strongly aligned with the company requirements.")
396
+ elif similarity_score >= 0.70:
397
+ st.success("Good match! This candidate shows strong potential for the position.")
398
+ elif similarity_score >= 0.50:
399
+ st.warning("Moderate match. The candidate meets some requirements but there may be gaps.")
400
  else:
401
+ st.error("Low match. The candidate's profile may not align well with the requirements.")