CR7CAD committed
Commit 189287e · verified · 1 Parent(s): 885deab

Update app.py

Files changed (1)
  1. app.py +282 -307
app.py CHANGED
@@ -1,11 +1,14 @@
1
- import os, io, re, time, tempfile
2
  import streamlit as st
3
- import docx, docx2txt
4
  import pandas as pd
5
  from functools import lru_cache
6
- import random # For reproducible randomization in scoring
7
 
8
- # Handle imports
9
  try:
10
  from transformers import pipeline
11
  has_pipeline = True
@@ -13,30 +16,33 @@ except ImportError:
13
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
14
  import torch
15
  has_pipeline = False
 
16
 
17
- # Setup page
18
  st.set_page_config(page_title="Resume-Job Fit Analyzer", initial_sidebar_state="collapsed")
19
- st.markdown("""<style>[data-testid="collapsedControl"],[data-testid="stSidebar"] {display: none;}</style>""", unsafe_allow_html=True)
20
 
21
  #####################################
22
  # Model Loading & Text Processing
23
  #####################################
24
- @st.cache_resource
25
  def load_models():
26
- with st.spinner("Loading AI models..."):
27
  models = {}
 
28
  # Load summarization model
29
  if has_pipeline:
30
- models['summarizer'] = pipeline("summarization", model="Falconsai/text_summarization", max_length=100)
31
  else:
32
  try:
33
  models['summarizer_model'] = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")
34
  models['summarizer_tokenizer'] = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
35
  except Exception as e:
36
  st.error(f"Error loading summarization model: {e}")
37
- models['summarizer_model'] = models['summarizer_tokenizer'] = None
 
38
 
39
- # Load evaluation model
40
  if has_pipeline:
41
  models['evaluator'] = pipeline("sentiment-analysis", model="CR7CAD/RobertaFinetuned")
42
  else:
@@ -45,426 +51,394 @@ def load_models():
45
  models['evaluator_tokenizer'] = AutoTokenizer.from_pretrained("CR7CAD/RobertaFinetuned")
46
  except Exception as e:
47
  st.error(f"Error loading sentiment model: {e}")
48
- models['evaluator_model'] = models['evaluator_tokenizer'] = None
 
 
49
  return models
50
 
51
  def summarize_text(text, models, max_length=100):
52
- """Summarize text with fallbacks"""
53
- input_text = text[:1024]
54
 
55
- # Try pipeline
56
  if has_pipeline and 'summarizer' in models:
57
  try:
58
  return models['summarizer'](input_text)[0]['summary_text']
59
- except: pass
 
60
 
61
- # Try manual model
62
  if 'summarizer_model' in models and models['summarizer_model']:
63
  try:
64
  tokenizer = models['summarizer_tokenizer']
65
  model = models['summarizer_model']
66
  inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
67
- summary_ids = model.generate(inputs.input_ids, max_length=max_length, min_length=30, num_beams=4)
68
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
69
- except: pass
 
70
 
71
- # Fallback - extract sentences
72
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
73
- scored = [(1.0/(i+1), s) for i, s in enumerate(sentences) if len(s.split()) >= 4]
74
- scored.sort(reverse=True)
75
-
76
- result, length = [], 0
77
- for _, sentence in scored:
78
- if length + len(sentence.split()) <= max_length:
79
- result.append(sentence)
80
- length += len(sentence.split())
81
-
82
- if result:
83
- ordered = sorted([(sentences.index(s), s) for s in result])
84
- return " ".join(s for _, s in ordered)
85
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  #####################################
88
- # File Processing & Information Extraction
89
  #####################################
90
- @st.cache_data
91
  def extract_text_from_file(file_obj):
92
- ext = os.path.splitext(file_obj.name)[1].lower()
 
93
 
94
  if ext == ".docx":
95
  try:
96
  document = docx.Document(file_obj)
97
- return "\n".join(para.text for para in document.paragraphs if para.text.strip())[:15000]
98
  except Exception as e:
99
  return f"Error processing DOCX file: {e}"
100
  elif ext == ".doc":
101
  try:
102
  with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
103
  temp_file.write(file_obj.getvalue())
104
- text = docx2txt.process(temp_file.name)
105
- os.unlink(temp_file.name)
106
- return text[:15000]
107
  except Exception as e:
108
  return f"Error processing DOC file: {e}"
109
  elif ext == ".txt":
110
  try:
111
- return file_obj.getvalue().decode("utf-8")[:15000]
112
  except Exception as e:
113
  return f"Error processing TXT file: {e}"
114
  else:
115
  return "Unsupported file type. Please upload a .docx, .doc, or .txt file."
116
 
117
- # Information extraction functions
118
- def extract_skills(text):
119
- """Extract skills from text"""
120
- text_lower = text.lower()
121
-
122
- # Define common skills
123
- skills_list = [
124
- "Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#", "Go", "R",
125
- "React", "Angular", "Vue", "Node.js", "jQuery", "Bootstrap", "PHP", "Ruby",
126
- "Machine Learning", "Data Analysis", "Big Data", "AI", "NLP", "Deep Learning",
127
- "MySQL", "MongoDB", "PostgreSQL", "Oracle", "Database", "ETL",
128
- "AWS", "Azure", "Google Cloud", "Docker", "Kubernetes", "CI/CD", "DevOps",
129
- "Git", "GitHub", "Agile", "Scrum", "Jira", "RESTful API", "GraphQL",
130
- "TensorFlow", "PyTorch", "SAS", "SPSS", "Tableau", "Power BI", "Excel",
131
- "Communication", "Teamwork", "Problem Solving", "Critical Thinking",
132
- "Leadership", "Project Management", "Time Management", "Flexibility", "Adaptability"
133
- ]
134
-
135
- # Extract matched skills
136
- found_skills = []
137
- for skill in skills_list:
138
- if skill.lower() in text_lower or re.search(r'\b' + re.escape(skill.lower()) + r'(?:\s|\b|ing|er)', text_lower):
139
- found_skills.append(skill)
140
-
141
- return list(set(found_skills)) # Remove duplicates
142
-
143
  @lru_cache(maxsize=32)
144
  def extract_name(text_start):
145
- lines = [line.strip() for line in text_start.split('\n')[:5] if line.strip()]
146
 
147
  if lines:
148
  first_line = lines[0]
149
- if 5 <= len(first_line) <= 40 and not any(x in first_line.lower() for x in ["resume", "cv", "curriculum", "vitae"]):
150
  return first_line
151
 
152
  for line in lines[:3]:
153
  if len(line.split()) <= 4 and not any(x in line.lower() for x in ["address", "phone", "email", "resume", "cv"]):
154
  return line
155
- return "Unknown"
 
156
 
157
  def extract_age(text):
158
- for pattern in [r'age:?\s*(\d{1,2})', r'(\d{1,2})\s*years\s*old', r'dob:.*(\d{4})', r'date of birth:.*(\d{4})']:
 
 
159
  match = re.search(pattern, text.lower())
160
  if match:
161
- if len(match.group(1)) == 4: # Birth year
162
- try: return str(2025 - int(match.group(1)))
163
- except: pass
 
 
164
  return match.group(1)
 
165
  return "Not specified"
166
 
167
  def extract_industry(text):
168
  industries = {
169
- "Technology": ["software", "programming", "developer", "IT", "tech", "computer", "digital"],
170
- "Finance": ["banking", "financial", "accounting", "finance", "analyst"],
171
- "Healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor"],
172
- "Education": ["teaching", "teacher", "professor", "education", "university", "school"],
173
- "Marketing": ["marketing", "advertising", "digital marketing", "social media", "brand"],
174
- "Engineering": ["engineer", "engineering", "mechanical", "civil", "electrical"],
175
- "Data Science": ["data science", "machine learning", "AI", "analytics", "big data"],
176
- "Management": ["manager", "management", "leadership", "executive", "director"]
177
  }
178
 
179
  text_lower = text.lower()
180
- counts = {ind: sum(text_lower.count(kw) for kw in kws) for ind, kws in industries.items()}
181
- return max(counts.items(), key=lambda x: x[1])[0] if any(counts.values()) else "Not specified"
182
 
183
  def extract_job_position(text):
184
- text_lower = text.lower()
185
- for pattern in [r'objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)', r'career\s*objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
186
- r'summary:?\s*(.*?)(?=\n\n|\n\w+:|\Z)', r'seeking.*position.*as\s*([^.]*)']:
187
- match = re.search(pattern, text_lower, re.IGNORECASE | re.DOTALL)
188
  if match:
189
- text = match.group(1).strip()
190
- for title in ["developer", "engineer", "analyst", "manager", "specialist", "designer"]:
191
- if title in text:
192
- return next((m.group(1).strip().title() for m in
193
- [re.search(r'(\w+\s+' + title + r')', text)] if m), title.title())
194
- return " ".join(text.split()[:10]).title() + "..." if len(text.split()) > 10 else text.title()
195
-
196
- # Check for job title near experience
197
- for pattern in [r'experience:.*?(\w+\s+\w+(?:\s+\w+)?)(?=\s*at|\s*\()', r'(\w+\s+\w+(?:\s+\w+)?)\s*\(\s*(?:current|present)']:
198
- match = re.search(pattern, text_lower, re.IGNORECASE)
199
- if match: return match.group(1).strip().title()
200
 
201
- return "Not specified"
 
 
 
 
 
 
 
 
 
202
 
203
- #####################################
204
- # Core Analysis Functions
205
- #####################################
206
  def summarize_resume_text(resume_text, models):
207
- start = time.time()
208
 
209
- # Basic info extraction
210
  name = extract_name(resume_text[:500])
211
  age = extract_age(resume_text)
212
  industry = extract_industry(resume_text)
213
  job_position = extract_job_position(resume_text)
214
  skills = extract_skills(resume_text)
215
 
216
- # Generate summary
217
  try:
218
  if has_pipeline and 'summarizer' in models:
219
- model_summary = models['summarizer'](resume_text[:2000], max_length=100, min_length=30)[0]['summary_text']
220
  else:
221
  model_summary = summarize_text(resume_text, models, max_length=100)
222
- except:
223
- model_summary = "Error generating summary."
 
224
 
225
- # Format result
226
- summary = f"Name: {name}\n\nAge: {age}\n\nExpected Industry: {industry}\n\n"
227
- summary += f"Expected Job Position: {job_position}\n\nSkills: {', '.join(skills)}\n\nSummary: {model_summary}"
228
 
229
- return summary, time.time() - start
230
 
231
  def extract_job_requirements(job_description, models):
232
- # Use the same skills list for consistency
233
- skills_list = [
234
- "Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#", "Go", "R",
235
- "React", "Angular", "Vue", "Node.js", "jQuery", "Bootstrap", "PHP", "Ruby",
236
- "Machine Learning", "Data Analysis", "Big Data", "AI", "NLP", "Deep Learning",
237
- "MySQL", "MongoDB", "PostgreSQL", "Oracle", "Database", "ETL",
238
- "AWS", "Azure", "Google Cloud", "Docker", "Kubernetes", "CI/CD", "DevOps",
239
- "Git", "GitHub", "Agile", "Scrum", "Jira", "RESTful API", "GraphQL",
240
- "TensorFlow", "PyTorch", "SAS", "SPSS", "Tableau", "Power BI", "Excel",
241
- "Communication", "Teamwork", "Problem Solving", "Critical Thinking",
242
- "Leadership", "Project Management", "Time Management", "Flexibility", "Adaptability"
243
  ]
244
 
245
- clean_text = job_description.lower()
246
 
247
- # Extract job title
 
248
  job_title = "Not specified"
249
- for pattern in [r'^([^:.\n]+?)(position|role|job)', r'^([^:.\n]+?)\n', r'hiring.*? ([^:.\n]+?)(:-|[.:]|\n|$)']:
250
- match = re.search(pattern, clean_text, re.IGNORECASE)
251
- if match:
252
- title = match.group(1).strip() if len(match.groups()) >= 1 else match.group(2).strip()
253
- if 3 <= len(title) <= 50:
254
- job_title = title.capitalize()
255
- break
256
-
257
- # Extract years required
258
- years_required = 0
259
- for pattern in [r'(\d+)(?:\+)?\s*(?:years|yrs).*?experience', r'experience.*?(\d+)(?:\+)?\s*(?:years|yrs)']:
260
- match = re.search(pattern, clean_text, re.IGNORECASE)
261
- if match:
262
- try:
263
- years_required = int(match.group(1))
264
- break
265
- except: pass
266
 
267
  # Extract skills
268
- required_skills = []
269
- for skill in skills_list:
270
- if skill.lower() in clean_text or re.search(r'\b' + re.escape(skill.lower()) + r'(?:\s|\b|ing|er)', clean_text):
271
- required_skills.append(skill)
272
 
273
- # Ensure at least some skills are found
274
  if not required_skills:
275
- words = [w for w in re.findall(r'\b\w{4,}\b', clean_text)
276
- if w not in ["with", "that", "this", "have", "from", "they", "will", "what", "your"]]
277
  word_counts = {}
278
- for w in words: word_counts[w] = word_counts.get(w, 0) + 1
279
- required_skills = [w.capitalize() for w, _ in sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:5]]
280
 
281
  return {
282
  "title": job_title,
283
  "years_experience": years_required,
284
  "required_skills": required_skills,
285
- "summary": summarize_text(job_description, models, max_length=100)
286
- }
287
-
288
- def evaluate_job_fit(resume_summary, job_requirements, models):
289
- start = time.time()
290
-
291
- # Set seed for consistent but varied evaluation
292
- random.seed(resume_summary[:20]) # Use part of resume text as seed
293
-
294
- # Basic extraction
295
- required_skills = job_requirements["required_skills"]
296
- years_required = job_requirements["years_experience"]
297
- job_title = job_requirements["title"]
298
- skills_mentioned = extract_skills(resume_summary)
299
-
300
- # Calculate matches
301
- matching_skills = [skill for skill in required_skills if skill in skills_mentioned]
302
-
303
- # BALANCED SCORING ALGORITHM
304
-
305
- # 1. Skill match score - linear with slight noise
306
- if not required_skills:
307
- skill_match = random.uniform(0.4, 0.6) # Random value if no skills required
308
- else:
309
- # Base score is the actual match percentage
310
- raw_match = len(matching_skills) / len(required_skills)
311
- # Add slight variance to create more distribution
312
- skill_match = max(0, min(1, raw_match + random.uniform(-0.1, 0.1)))
313
-
314
- # 2. Experience match - closer to realistic assessment
315
- years_experience = 0
316
- exp_match = re.search(r'(\d+)\+?\s*years?\s*(?:of)?\s*experience', resume_summary, re.IGNORECASE)
317
- if exp_match:
318
- try: years_experience = int(exp_match.group(1))
319
- except: pass
320
-
321
- if years_required == 0:
322
- # If no experience required, slight preference for experienced candidates
323
- exp_match_ratio = random.uniform(0.5, 0.8) if years_experience > 0 else random.uniform(0.3, 0.6)
324
- else:
325
- # For jobs with required experience
326
- ratio = years_experience / max(1, years_required)
327
-
328
- if ratio < 0.6: # Significantly underqualified
329
- exp_match_ratio = random.uniform(0.2, 0.4)
330
- elif ratio < 0.9: # Slightly underqualified - potential fit territory
331
- exp_match_ratio = random.uniform(0.4, 0.6)
332
- elif ratio <= 1.5: # Just right - good fit territory
333
- exp_match_ratio = random.uniform(0.7, 0.9)
334
- else: # Overqualified - could be good or potential
335
- exp_match_ratio = random.uniform(0.6, 0.8)
336
-
337
- # 3. Title matching - realistic assessment
338
- title_words = [w for w in job_title.lower().split() if len(w) > 3]
339
-
340
- if not title_words:
341
- title_match = random.uniform(0.4, 0.6) # Random if no meaningful title words
342
- else:
343
- matches = 0
344
- for word in title_words:
345
- if word in resume_summary.lower():
346
- matches += 1
347
- # Look for similar words
348
- elif any(w.startswith(word[:4]) for w in resume_summary.lower().split() if len(w) > 3):
349
- matches += 0.5
350
-
351
- # Calculate raw match and add slight variance
352
- raw_title_match = matches / len(title_words)
353
- title_match = max(0, min(1, raw_title_match + random.uniform(-0.1, 0.1)))
354
-
355
- # Convert to 0-2 scale with slight adjustments for better distribution
356
- skill_score = skill_match * 2.0
357
- exp_score = exp_match_ratio * 2.0
358
- title_score = title_match * 2.0
359
-
360
- # Extract candidate info
361
- name = re.search(r'Name:\s*(.*?)(?=\n|\Z)', resume_summary)
362
- name = name.group(1).strip() if name else "The candidate"
363
-
364
- industry = re.search(r'Expected Industry:\s*(.*?)(?=\n|\Z)', resume_summary)
365
- industry = industry.group(1).strip() if industry else "unspecified industry"
366
-
367
- # Calculate weighted score - balanced weights
368
- weighted_score = (skill_score * 0.45) + (exp_score * 0.35) + (title_score * 0.20)
369
-
370
- # Small random adjustment to increase distribution variety
371
- # This creates more natural variation in scores
372
- weighted_score = max(0, min(2, weighted_score + random.uniform(-0.15, 0.15)))
373
-
374
- # Set thresholds for better distribution across categories
375
- # These thresholds aim for roughly equal distribution on average
376
- if weighted_score >= 1.2:
377
- fit_score = 2 # Good fit (roughly 33% of cases)
378
- elif weighted_score >= 0.7:
379
- fit_score = 1 # Potential fit (roughly 33% of cases)
380
- else:
381
- fit_score = 0 # Not a fit (roughly 33% of cases)
382
-
383
- # Store debug info
384
- st.session_state['debug_scores'] = {
385
- 'skill_match': skill_match,
386
- 'skill_score': skill_score,
387
- 'exp_match_ratio': exp_match_ratio,
388
- 'exp_score': exp_score,
389
- 'title_match': title_match,
390
- 'title_score': title_score,
391
- 'weighted_score': weighted_score,
392
- 'fit_score': fit_score,
393
- 'matching_skills': matching_skills,
394
- 'required_skills': required_skills,
395
- 'skill_percentage': f"{len(matching_skills)}/{len(required_skills)}"
396
  }
397
-
398
- # Generate assessment
399
- missing = [skill for skill in required_skills if skill not in skills_mentioned]
400
-
401
- if fit_score == 2:
402
- assessment = f"{fit_score}: GOOD FIT - {name} demonstrates strong alignment with the {job_title} position. Their background in {industry} appears well-suited for this role's requirements."
403
- elif fit_score == 1:
404
- assessment = f"{fit_score}: POTENTIAL FIT - {name} shows potential for the {job_title} role but has gaps in certain areas. Additional training might be needed in {', '.join(missing[:2])}."
405
- else:
406
- assessment = f"{fit_score}: NO FIT - {name}'s background shows limited alignment with this {job_title} position. Their experience and skills differ significantly from the requirements."
407
-
408
- return assessment, fit_score, time.time() - start
409
 
410
  def analyze_job_fit(resume_summary, job_description, models):
411
- start = time.time()
412
  job_requirements = extract_job_requirements(job_description, models)
413
- assessment, fit_score, _ = evaluate_job_fit(resume_summary, job_requirements, models)
414
- return assessment, fit_score, time.time() - start
415
 
416
  #####################################
417
- # Main Function
418
  #####################################
419
  def main():
420
- # Initialize session state for debug info
421
- if 'debug_scores' not in st.session_state:
422
- st.session_state['debug_scores'] = {}
423
-
424
  st.title("Resume-Job Fit Analyzer")
425
- st.markdown("Upload your resume file in **.docx**, **.doc**, or **.txt** format and enter a job description to see how well you match.")
 
 
 
 
 
 
 
426
 
427
- # Load models and get inputs
428
- models = load_models()
429
- uploaded_file = st.file_uploader("Upload your resume", type=["docx", "doc", "txt"])
430
  job_description = st.text_area("Enter Job Description", height=200, placeholder="Paste the job description here...")
431
 
432
- # Process when button clicked
433
- if uploaded_file and job_description and st.button("Analyze Job Fit"):
434
- progress = st.progress(0)
435
- status = st.empty()
 
436
 
437
  # Step 1: Extract text
438
- status.text("Step 1/3: Extracting text from resume...")
439
  resume_text = extract_text_from_file(uploaded_file)
440
- progress.progress(25)
441
 
442
  if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx, .doc, or .txt file.":
443
  st.error(resume_text)
444
  else:
445
  # Step 2: Generate summary
446
- status.text("Step 2/3: Analyzing resume...")
447
- summary, summary_time = summarize_resume_text(resume_text, models)
448
- progress.progress(50)
 
 
449
  st.subheader("Your Resume Summary")
450
  st.markdown(summary)
451
 
452
- # Step 3: Evaluate fit
453
- status.text("Step 3/3: Evaluating job fit...")
454
- assessment, fit_score, eval_time = analyze_job_fit(summary, job_description, models)
455
- progress.progress(100)
456
- status.empty()
457
 
458
- # Display results
459
  st.subheader("Job Fit Assessment")
460
- fit_labels = {0: "NOT FIT", 1: "POTENTIAL FIT", 2: "GOOD FIT"}
461
- colors = {0: "red", 1: "orange", 2: "green"}
462
- st.markdown(f"<h2 style='color: {colors[fit_score]};'>{fit_labels[fit_score]}</h2>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
463
  st.markdown(assessment)
464
- st.info(f"Analysis completed in {(summary_time + eval_time):.2f} seconds")
 
465
 
466
- # Recommendations
467
  st.subheader("Recommended Next Steps")
 
468
  if fit_score == 2:
469
  st.markdown("""
470
  - Apply for this position as you appear to be a good match
@@ -484,5 +458,6 @@ def main():
484
  - Consider similar roles with fewer experience requirements
485
  """)
486
 
 
487
  if __name__ == "__main__":
488
  main()
 
1
+ import os
2
  import streamlit as st
3
+ import docx
4
+ import docx2txt
5
+ import tempfile
6
+ import time
7
+ import re
8
  import pandas as pd
9
  from functools import lru_cache
 
10
 
11
+ # Simplify imports
12
  try:
13
  from transformers import pipeline
14
  has_pipeline = True
 
16
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
17
  import torch
18
  has_pipeline = False
19
+ st.warning("Using basic transformers functionality instead of pipeline API")
20
 
21
+ # Page setup
22
  st.set_page_config(page_title="Resume-Job Fit Analyzer", initial_sidebar_state="collapsed")
23
+ st.markdown("<style>[data-testid='collapsedControl'] {display: none;} section[data-testid='stSidebar'] {display: none;}</style>", unsafe_allow_html=True)
24
 
25
  #####################################
26
  # Model Loading & Text Processing
27
  #####################################
28
+ @st.cache_resource(show_spinner=True)
29
  def load_models():
30
+ with st.spinner("Loading AI models... This may take a minute on first run."):
31
  models = {}
32
+
33
  # Load summarization model
34
  if has_pipeline:
35
+ models['summarizer'] = pipeline("summarization", model="Falconsai/text_summarization", max_length=100, truncation=True)
36
  else:
37
  try:
38
  models['summarizer_model'] = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")
39
  models['summarizer_tokenizer'] = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
40
  except Exception as e:
41
  st.error(f"Error loading summarization model: {e}")
42
+ models['summarizer_model'] = None
43
+ models['summarizer_tokenizer'] = None
44
 
45
+ # Load sentiment model
46
  if has_pipeline:
47
  models['evaluator'] = pipeline("sentiment-analysis", model="CR7CAD/RobertaFinetuned")
48
  else:
 
51
  models['evaluator_tokenizer'] = AutoTokenizer.from_pretrained("CR7CAD/RobertaFinetuned")
52
  except Exception as e:
53
  st.error(f"Error loading sentiment model: {e}")
54
+ models['evaluator_model'] = None
55
+ models['evaluator_tokenizer'] = None
56
+
57
  return models
58
 
59
  def summarize_text(text, models, max_length=100):
60
+ input_text = text[:1024] # Limit input length
 
61
 
 
62
  if has_pipeline and 'summarizer' in models:
63
  try:
64
  return models['summarizer'](input_text)[0]['summary_text']
65
+ except Exception as e:
66
+ st.warning(f"Error in pipeline summarization: {e}")
67
 
 
68
  if 'summarizer_model' in models and models['summarizer_model']:
69
  try:
70
  tokenizer = models['summarizer_tokenizer']
71
  model = models['summarizer_model']
72
  inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
73
+ summary_ids = model.generate(inputs.input_ids, max_length=max_length, min_length=30, num_beams=4, early_stopping=True)
74
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
75
+ except Exception as e:
76
+ st.warning(f"Error in manual summarization: {e}")
77
 
78
+ # Fallback to basic summarization
79
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
80
+ scored_sentences = []
81
+ for i, sentence in enumerate(sentences):
82
+ if len(sentence.split()) < 4: continue
83
+ score = 1.0 / (i + 1) - (0.01 * max(0, len(sentence.split()) - 20))
84
+ scored_sentences.append((score, sentence))
85
+
86
+ scored_sentences.sort(reverse=True)
87
+ summary_sentences, current_length = [], 0
88
+
89
+ for _, sentence in scored_sentences:
90
+ if current_length + len(sentence.split()) <= max_length:
91
+ summary_sentences.append(sentence)
92
+ current_length += len(sentence.split())
93
+ else:
94
+ break
95
+
96
+ if summary_sentences:
97
+ original_order = sorted([(sentences.index(s), s) for s in summary_sentences])
98
+ summary_sentences = [s for _, s in original_order]
99
+
100
+ return " ".join(summary_sentences)
101
+
102
+ # Keep job fit evaluation function intact as it's critical
103
+ def evaluate_job_fit(resume_summary, job_requirements, models):
104
+ start_time = time.time()
105
+
106
+ required_skills = job_requirements["required_skills"]
107
+ years_required = job_requirements["years_experience"]
108
+ job_title = job_requirements["title"]
109
+ job_summary = job_requirements["summary"]
110
+
111
+ skills_mentioned = extract_skills(resume_summary)
112
+
113
+ matching_skills = [skill for skill in required_skills if skill in skills_mentioned]
114
+ skill_match_percentage = len(matching_skills) / len(required_skills) if required_skills else 0
115
+
116
+ experience_pattern = r'(\d+)\+?\s*years?\s*(?:of)?\s*experience'
117
+ experience_match = re.search(experience_pattern, resume_summary, re.IGNORECASE)
118
+ years_experience = 0
119
+ if experience_match:
120
+ try:
121
+ years_experience = int(experience_match.group(1))
122
+ except:
123
+ years_experience = 0
124
+
125
+ exp_match_ratio = min(1.0, years_experience / max(1, years_required)) if years_required > 0 else 0.5
126
+
127
+ job_title_lower = job_title.lower()
128
+ title_match = 0
129
+
130
+ title_words = [word for word in job_title_lower.split() if len(word) > 3]
131
+ title_matches = sum(1 for word in title_words if word in resume_summary.lower())
132
+ title_match = title_matches / len(title_words) if title_words else 0
133
+
134
+ skill_score = min(2, skill_match_percentage * 3)
135
+ exp_score = min(2, exp_match_ratio * 2)
136
+ title_score = min(2, title_match * 2)
137
+
138
+ name_match = re.search(r'Name:\s*(.*?)(?=\n|\Z)', resume_summary)
139
+ name = name_match.group(1).strip() if name_match else "The candidate"
140
+
141
+ age_match = re.search(r'Age:\s*(.*?)(?=\n|\Z)', resume_summary)
142
+ age = age_match.group(1).strip() if age_match else "unspecified age"
143
+
144
+ industry_match = re.search(r'Expected Industry:\s*(.*?)(?=\n|\Z)', resume_summary)
145
+ industry = industry_match.group(1).strip() if industry_match else "unspecified industry"
146
+
147
+ weighted_score = (skill_score * 0.5) + (exp_score * 0.3) + (title_score * 0.2)
148
+
149
+ if weighted_score >= 1.5:
150
+ fit_score = 2 # Good fit
151
+ elif weighted_score >= 0.8:
152
+ fit_score = 1 # Potential fit
153
+ else:
154
+ fit_score = 0 # Not a fit
155
+
156
+ missing_skills = [skill for skill in required_skills if skill not in skills_mentioned]
157
+
158
+ if fit_score == 2:
159
+ fit_assessment = f"{fit_score}: GOOD FIT - {name} demonstrates strong alignment with the {job_title} position. Their background in {industry} and professional experience appear well-suited for this role's requirements. The technical expertise matches what the position demands."
160
+ elif fit_score == 1:
161
+ fit_assessment = f"{fit_score}: POTENTIAL FIT - {name} shows potential for the {job_title} role with some relevant experience, though there are gaps in certain technical areas. Their {industry} background provides partial alignment with the position requirements. Additional training might be needed in {', '.join(missing_skills[:2])} if pursuing this opportunity."
162
+ else:
163
+ fit_assessment = f"{fit_score}: NO FIT - {name}'s current background shows limited alignment with this {job_title} position. Their experience level and technical background differ significantly from the role requirements. A position better matching their {industry} expertise might be more suitable."
164
+
165
+ execution_time = time.time() - start_time
166
+
167
+ return fit_assessment, fit_score, execution_time
168
 
169
  #####################################
170
+ # File & Information Extraction
171
  #####################################
172
+ @st.cache_data(show_spinner=False)
173
  def extract_text_from_file(file_obj):
174
+ filename = file_obj.name
175
+ ext = os.path.splitext(filename)[1].lower()
176
 
177
  if ext == ".docx":
178
  try:
179
  document = docx.Document(file_obj)
180
+ return "\n".join(para.text for para in document.paragraphs if para.text.strip())
181
  except Exception as e:
182
  return f"Error processing DOCX file: {e}"
183
  elif ext == ".doc":
184
  try:
185
  with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
186
  temp_file.write(file_obj.getvalue())
187
+ temp_path = temp_file.name
188
+
189
+ try:
190
+ text = docx2txt.process(temp_path)
191
+ except Exception:
192
+ text = "Could not process .doc file. Please convert to .docx format."
193
+
194
+ os.unlink(temp_path)
195
+ return text
196
  except Exception as e:
197
  return f"Error processing DOC file: {e}"
198
  elif ext == ".txt":
199
  try:
200
+ return file_obj.getvalue().decode("utf-8")
201
  except Exception as e:
202
  return f"Error processing TXT file: {e}"
203
  else:
204
  return "Unsupported file type. Please upload a .docx, .doc, or .txt file."
205
 
206
+ # Simplified information extraction functions
207
  @lru_cache(maxsize=32)
208
  def extract_name(text_start):
209
+ lines = [line.strip() for line in text_start.split('\n') if line.strip()][:5]
210
 
211
  if lines:
212
  first_line = lines[0]
213
+ if 5 <= len(first_line) <= 40 and not any(x in first_line.lower() for x in ["resume", "cv", "curriculum", "vitae", "profile"]):
214
  return first_line
215
 
216
  for line in lines[:3]:
217
  if len(line.split()) <= 4 and not any(x in line.lower() for x in ["address", "phone", "email", "resume", "cv"]):
218
  return line
219
+
220
+ return "Unknown (please extract from resume)"
221
 
222
  def extract_age(text):
223
+ age_patterns = [r'age:?\s*(\d{1,2})', r'(\d{1,2})\s*years\s*old', r'dob:.*(\d{4})', r'date of birth:.*(\d{4})']
224
+
225
+ for pattern in age_patterns:
226
  match = re.search(pattern, text.lower())
227
  if match:
228
+ if len(match.group(1)) == 4: # Year of birth
229
+ try:
230
+ return str(2025 - int(match.group(1)))
231
+ except:
232
+ pass
233
  return match.group(1)
234
+
235
  return "Not specified"
236
 
237
  def extract_industry(text):
238
  industries = {
239
+ "Technology": ["software", "programming", "developer", "IT", "tech"],
240
+ "Finance": ["banking", "financial", "accounting", "finance"],
241
+ "Healthcare": ["medical", "health", "hospital", "clinical"],
242
+ "Education": ["teaching", "education", "university", "school"],
243
+ "Marketing": ["marketing", "advertising", "digital marketing"],
244
+ "Engineering": ["engineer", "engineering", "mechanical"],
245
+ "Data Science": ["data science", "machine learning", "AI"],
246
+ "Management": ["manager", "management", "leadership"]
247
  }
248
 
249
  text_lower = text.lower()
250
+ industry_counts = {industry: sum(text_lower.count(kw.lower()) for kw in keywords)
251
+ for industry, keywords in industries.items()}
252
+
253
+ if industry_counts:
254
+ return max(industry_counts.items(), key=lambda x: x[1])[0]
255
+
256
+ return "Not clearly specified"
257
 
258
  def extract_job_position(text):
259
+ objective_patterns = [
260
+ r'objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
261
+ r'career\s*objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)'
262
+ ]
263
+
264
+ for pattern in objective_patterns:
265
+ match = re.search(pattern, text.lower(), re.IGNORECASE | re.DOTALL)
266
  if match:
267
+ objective_text = match.group(1).strip()
268
+ job_titles = ["developer", "engineer", "analyst", "manager", "director", "specialist"]
269
+
270
+ for title in job_titles:
271
+ if title in objective_text:
272
+ title_pattern = r'(?:a|an)?\s*(\w+\s+' + title + r'|\w+\s+\w+\s+' + title + r')'
273
+ title_match = re.search(title_pattern, objective_text)
274
+ if title_match:
275
+ return title_match.group(1).strip().title()
276
+ return title.title()
277
+
278
+ if len(objective_text) > 10:
279
+ words = objective_text.split()
280
+ return " ".join(words[:10]).title() + "..." if len(words) > 10 else objective_text.title()
281
 
282
+ return "Not explicitly stated"
283
+
284
+ def extract_skills(text):
285
+ tech_skills = [
286
+ "Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "React", "Angular",
287
+ "Machine Learning", "Data Analysis", "AWS", "Docker", "Git", "Agile",
288
+ "Communication", "Leadership", "Project Management"
289
+ ]
290
+
291
+ return [skill for skill in tech_skills if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text.lower())]
292
 
 
 
 
293
  def summarize_resume_text(resume_text, models):
294
+ start_time = time.time()
295
 
 
296
  name = extract_name(resume_text[:500])
297
  age = extract_age(resume_text)
298
  industry = extract_industry(resume_text)
299
  job_position = extract_job_position(resume_text)
300
  skills = extract_skills(resume_text)
301
 
 
302
  try:
303
  if has_pipeline and 'summarizer' in models:
304
+ model_summary = models['summarizer'](resume_text[:2000], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
305
  else:
306
  model_summary = summarize_text(resume_text, models, max_length=100)
307
+ except Exception as e:
308
+ st.warning(f"Error in resume summarization: {e}")
309
+ model_summary = "Error generating summary. Please check the original resume."
310
 
311
+ formatted_summary = f"Name: {name}\n\nAge: {age}\n\nExpected Industry: {industry}\n\n"
312
+ formatted_summary += f"Expected Job Position: {job_position}\n\nSkills: {', '.join(skills)}\n\nSummary: {model_summary}"
 
313
 
314
+ return formatted_summary, time.time() - start_time
315
 
316
  def extract_job_requirements(job_description, models):
317
+ tech_skills = [
318
+ "Python", "Java", "JavaScript", "SQL", "React", "Angular",
319
+ "Machine Learning", "AWS", "Docker", "Git", "Agile",
320
+ "Communication", "Leadership", "Project Management"
 
 
 
 
 
 
 
321
  ]
322
 
323
+ clean_job_text = job_description.lower()
324
 
325
+ # Extract title
326
+ title_patterns = [r'^([^:.\n]+?)(position|role|job)', r'^([^:.\n]+?)\n']
327
  job_title = "Not specified"
328
+
329
+ for pattern in title_patterns:
330
+ match = re.search(pattern, clean_job_text, re.IGNORECASE)
331
+ if match and 3 <= len(match.group(1).strip()) <= 50:
332
+ job_title = match.group(1).strip().capitalize()
333
+ break
334
+
335
+ # Extract experience
336
+ exp_match = re.search(r'(\d+)(?:\+)?\s*(?:years|yrs)(?:\s*of)?\s*(?:experience|exp)', clean_job_text)
337
+ years_required = int(exp_match.group(1)) if exp_match else 0
338
 
339
  # Extract skills
340
+ required_skills = [skill for skill in tech_skills if re.search(r'\b' + re.escape(skill.lower()) + r'\b', clean_job_text)]
341
 
342
+ # If no skills found, extract common words
343
  if not required_skills:
344
+ words = re.findall(r'\b\w{4,}\b', clean_job_text)
 
345
  word_counts = {}
346
+ for word in words:
347
+ if word not in ["with", "that", "this", "have", "from", "they", "will"]:
348
+ word_counts[word] = word_counts.get(word, 0) + 1
349
+
350
+ required_skills = [word.capitalize() for word, _ in sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:5]]
351
+
352
+ job_summary = summarize_text(job_description, models, max_length=100)
353
 
354
  return {
355
  "title": job_title,
356
  "years_experience": years_required,
357
  "required_skills": required_skills,
358
+ "summary": job_summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  }
360
 
361
  def analyze_job_fit(resume_summary, job_description, models):
362
+ start_time = time.time()
363
  job_requirements = extract_job_requirements(job_description, models)
364
+ assessment, fit_score, execution_time = evaluate_job_fit(resume_summary, job_requirements, models)
365
+ return assessment, fit_score, time.time() - start_time
366
+
367
+ # Load models at startup
368
+ models = load_models()
369
 
370
  #####################################
371
+ # Main Function - Kept intact
372
  #####################################
373
  def main():
374
+ """Main function for the Streamlit application"""
 
 
 
375
  st.title("Resume-Job Fit Analyzer")
376
+ st.markdown(
377
+ """
378
+ Upload your resume file in **.docx**, **.doc**, or **.txt** format and enter a job description to see how well you match with the job requirements.
379
+ """
380
+ )
381
+
382
+ # Resume upload
383
+ uploaded_file = st.file_uploader("Upload your resume (.docx, .doc, or .txt)", type=["docx", "doc", "txt"])
384
 
385
+ # Job description input
 
 
386
  job_description = st.text_area("Enter Job Description", height=200, placeholder="Paste the job description here...")
387
 
388
+ # Process button with optimized flow
389
+ if uploaded_file is not None and job_description and st.button("Analyze Job Fit"):
390
+ # Create a placeholder for the progress bar
391
+ progress_bar = st.progress(0)
392
+ status_text = st.empty()
393
 
394
  # Step 1: Extract text
395
+ status_text.text("Step 1/3: Extracting text from resume...")
396
  resume_text = extract_text_from_file(uploaded_file)
397
+ progress_bar.progress(25)
398
 
399
  if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx, .doc, or .txt file.":
400
  st.error(resume_text)
401
  else:
402
  # Step 2: Generate summary
403
+ status_text.text("Step 2/3: Analyzing resume and generating summary...")
404
+ summary, summarization_time = summarize_resume_text(resume_text, models)
405
+ progress_bar.progress(50)
406
+
407
+ # Display summary
408
  st.subheader("Your Resume Summary")
409
  st.markdown(summary)
410
 
411
+ # Step 3: Generate job fit assessment
412
+ status_text.text("Step 3/3: Evaluating job fit (this will take a moment)...")
413
+ assessment, fit_score, assessment_time = analyze_job_fit(summary, job_description, models)
414
+ progress_bar.progress(100)
 
415
 
416
+ # Clear status messages
417
+ status_text.empty()
418
+
419
+ # Display job fit results
420
  st.subheader("Job Fit Assessment")
421
+
422
+ # Display fit score with label
423
+ fit_labels = {
424
+ 0: "NO FIT",
425
+ 1: "POTENTIAL FIT",
426
+ 2: "GOOD FIT"
427
+ }
428
+
429
+ # Show the score prominently with appropriate coloring
430
+ score_label = fit_labels[fit_score]
431
+ score_colors = {0: "red", 1: "orange", 2: "green"}
432
+ st.markdown(f"<h2 style='color: {score_colors[fit_score]};'>{score_label}</h2>", unsafe_allow_html=True)
433
+
434
+ # Display assessment
435
  st.markdown(assessment)
436
+
437
+ st.info(f"Analysis completed in {(summarization_time + assessment_time):.2f} seconds")
438
 
439
+ # Add potential next steps based on the fit score
440
  st.subheader("Recommended Next Steps")
441
+
442
  if fit_score == 2:
443
  st.markdown("""
444
  - Apply for this position as you appear to be a good match
 
458
  - Consider similar roles with fewer experience requirements
459
  """)
460
 
461
+ # Run the main function
462
  if __name__ == "__main__":
463
  main()