CR7CAD committed on
Commit
0af81d7
·
verified ·
1 Parent(s): 189287e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +211 -281
app.py CHANGED
@@ -1,14 +1,10 @@
1
- import os
2
  import streamlit as st
3
- import docx
4
- import docx2txt
5
- import tempfile
6
- import time
7
- import re
8
  import pandas as pd
9
  from functools import lru_cache
10
 
11
- # Simplify imports
12
  try:
13
  from transformers import pipeline
14
  has_pipeline = True
@@ -16,33 +12,30 @@ except ImportError:
16
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
17
  import torch
18
  has_pipeline = False
19
- st.warning("Using basic transformers functionality instead of pipeline API")
20
 
21
- # Page setup
22
  st.set_page_config(page_title="Resume-Job Fit Analyzer", initial_sidebar_state="collapsed")
23
- st.markdown("<style>[data-testid='collapsedControl'] {display: none;} section[data-testid='stSidebar'] {display: none;}</style>", unsafe_allow_html=True)
24
 
25
  #####################################
26
  # Model Loading & Text Processing
27
  #####################################
28
- @st.cache_resource(show_spinner=True)
29
  def load_models():
30
- with st.spinner("Loading AI models... This may take a minute on first run."):
31
  models = {}
32
-
33
  # Load summarization model
34
  if has_pipeline:
35
- models['summarizer'] = pipeline("summarization", model="Falconsai/text_summarization", max_length=100, truncation=True)
36
  else:
37
  try:
38
  models['summarizer_model'] = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")
39
  models['summarizer_tokenizer'] = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
40
  except Exception as e:
41
  st.error(f"Error loading summarization model: {e}")
42
- models['summarizer_model'] = None
43
- models['summarizer_tokenizer'] = None
44
 
45
- # Load sentiment model
46
  if has_pipeline:
47
  models['evaluator'] = pipeline("sentiment-analysis", model="CR7CAD/RobertaFinetuned")
48
  else:
@@ -51,394 +44,332 @@ def load_models():
51
  models['evaluator_tokenizer'] = AutoTokenizer.from_pretrained("CR7CAD/RobertaFinetuned")
52
  except Exception as e:
53
  st.error(f"Error loading sentiment model: {e}")
54
- models['evaluator_model'] = None
55
- models['evaluator_tokenizer'] = None
56
-
57
  return models
58
 
59
  def summarize_text(text, models, max_length=100):
60
- input_text = text[:1024] # Limit input length
 
61
 
 
62
  if has_pipeline and 'summarizer' in models:
63
  try:
64
  return models['summarizer'](input_text)[0]['summary_text']
65
- except Exception as e:
66
- st.warning(f"Error in pipeline summarization: {e}")
67
 
 
68
  if 'summarizer_model' in models and models['summarizer_model']:
69
  try:
70
  tokenizer = models['summarizer_tokenizer']
71
  model = models['summarizer_model']
72
  inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
73
- summary_ids = model.generate(inputs.input_ids, max_length=max_length, min_length=30, num_beams=4, early_stopping=True)
74
  return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
75
- except Exception as e:
76
- st.warning(f"Error in manual summarization: {e}")
77
 
78
- # Fallback to basic summarization
79
  sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
80
- scored_sentences = []
81
- for i, sentence in enumerate(sentences):
82
- if len(sentence.split()) < 4: continue
83
- score = 1.0 / (i + 1) - (0.01 * max(0, len(sentence.split()) - 20))
84
- scored_sentences.append((score, sentence))
85
-
86
- scored_sentences.sort(reverse=True)
87
- summary_sentences, current_length = [], 0
88
-
89
- for _, sentence in scored_sentences:
90
- if current_length + len(sentence.split()) <= max_length:
91
- summary_sentences.append(sentence)
92
- current_length += len(sentence.split())
93
- else:
94
- break
95
-
96
- if summary_sentences:
97
- original_order = sorted([(sentences.index(s), s) for s in summary_sentences])
98
- summary_sentences = [s for _, s in original_order]
99
-
100
- return " ".join(summary_sentences)
101
-
102
- # Keep job fit evaluation function intact as it's critical
103
- def evaluate_job_fit(resume_summary, job_requirements, models):
104
- start_time = time.time()
105
-
106
- required_skills = job_requirements["required_skills"]
107
- years_required = job_requirements["years_experience"]
108
- job_title = job_requirements["title"]
109
- job_summary = job_requirements["summary"]
110
-
111
- skills_mentioned = extract_skills(resume_summary)
112
-
113
- matching_skills = [skill for skill in required_skills if skill in skills_mentioned]
114
- skill_match_percentage = len(matching_skills) / len(required_skills) if required_skills else 0
115
-
116
- experience_pattern = r'(\d+)\+?\s*years?\s*(?:of)?\s*experience'
117
- experience_match = re.search(experience_pattern, resume_summary, re.IGNORECASE)
118
- years_experience = 0
119
- if experience_match:
120
- try:
121
- years_experience = int(experience_match.group(1))
122
- except:
123
- years_experience = 0
124
-
125
- exp_match_ratio = min(1.0, years_experience / max(1, years_required)) if years_required > 0 else 0.5
126
-
127
- job_title_lower = job_title.lower()
128
- title_match = 0
129
-
130
- title_words = [word for word in job_title_lower.split() if len(word) > 3]
131
- title_matches = sum(1 for word in title_words if word in resume_summary.lower())
132
- title_match = title_matches / len(title_words) if title_words else 0
133
-
134
- skill_score = min(2, skill_match_percentage * 3)
135
- exp_score = min(2, exp_match_ratio * 2)
136
- title_score = min(2, title_match * 2)
137
-
138
- name_match = re.search(r'Name:\s*(.*?)(?=\n|\Z)', resume_summary)
139
- name = name_match.group(1).strip() if name_match else "The candidate"
140
-
141
- age_match = re.search(r'Age:\s*(.*?)(?=\n|\Z)', resume_summary)
142
- age = age_match.group(1).strip() if age_match else "unspecified age"
143
-
144
- industry_match = re.search(r'Expected Industry:\s*(.*?)(?=\n|\Z)', resume_summary)
145
- industry = industry_match.group(1).strip() if industry_match else "unspecified industry"
146
-
147
- weighted_score = (skill_score * 0.5) + (exp_score * 0.3) + (title_score * 0.2)
148
-
149
- if weighted_score >= 1.5:
150
- fit_score = 2 # Good fit
151
- elif weighted_score >= 0.8:
152
- fit_score = 1 # Potential fit
153
- else:
154
- fit_score = 0 # Not a fit
155
-
156
- missing_skills = [skill for skill in required_skills if skill not in skills_mentioned]
157
-
158
- if fit_score == 2:
159
- fit_assessment = f"{fit_score}: GOOD FIT - {name} demonstrates strong alignment with the {job_title} position. Their background in {industry} and professional experience appear well-suited for this role's requirements. The technical expertise matches what the position demands."
160
- elif fit_score == 1:
161
- fit_assessment = f"{fit_score}: POTENTIAL FIT - {name} shows potential for the {job_title} role with some relevant experience, though there are gaps in certain technical areas. Their {industry} background provides partial alignment with the position requirements. Additional training might be needed in {', '.join(missing_skills[:2])} if pursuing this opportunity."
162
- else:
163
- fit_assessment = f"{fit_score}: NO FIT - {name}'s current background shows limited alignment with this {job_title} position. Their experience level and technical background differ significantly from the role requirements. A position better matching their {industry} expertise might be more suitable."
164
-
165
- execution_time = time.time() - start_time
166
-
167
- return fit_assessment, fit_score, execution_time
168
 
169
  #####################################
170
- # File & Information Extraction
171
  #####################################
172
- @st.cache_data(show_spinner=False)
173
  def extract_text_from_file(file_obj):
174
- filename = file_obj.name
175
- ext = os.path.splitext(filename)[1].lower()
176
 
177
  if ext == ".docx":
178
  try:
179
  document = docx.Document(file_obj)
180
- return "\n".join(para.text for para in document.paragraphs if para.text.strip())
181
  except Exception as e:
182
  return f"Error processing DOCX file: {e}"
183
  elif ext == ".doc":
184
  try:
185
  with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
186
  temp_file.write(file_obj.getvalue())
187
- temp_path = temp_file.name
188
-
189
- try:
190
- text = docx2txt.process(temp_path)
191
- except Exception:
192
- text = "Could not process .doc file. Please convert to .docx format."
193
-
194
- os.unlink(temp_path)
195
- return text
196
  except Exception as e:
197
  return f"Error processing DOC file: {e}"
198
  elif ext == ".txt":
199
  try:
200
- return file_obj.getvalue().decode("utf-8")
201
  except Exception as e:
202
  return f"Error processing TXT file: {e}"
203
  else:
204
  return "Unsupported file type. Please upload a .docx, .doc, or .txt file."
205
 
206
- # Simplified information extraction functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  @lru_cache(maxsize=32)
208
  def extract_name(text_start):
209
- lines = [line.strip() for line in text_start.split('\n') if line.strip()][:5]
210
 
211
  if lines:
212
  first_line = lines[0]
213
- if 5 <= len(first_line) <= 40 and not any(x in first_line.lower() for x in ["resume", "cv", "curriculum", "vitae", "profile"]):
214
  return first_line
215
 
216
  for line in lines[:3]:
217
  if len(line.split()) <= 4 and not any(x in line.lower() for x in ["address", "phone", "email", "resume", "cv"]):
218
  return line
219
-
220
- return "Unknown (please extract from resume)"
221
 
222
  def extract_age(text):
223
- age_patterns = [r'age:?\s*(\d{1,2})', r'(\d{1,2})\s*years\s*old', r'dob:.*(\d{4})', r'date of birth:.*(\d{4})']
224
-
225
- for pattern in age_patterns:
226
  match = re.search(pattern, text.lower())
227
  if match:
228
- if len(match.group(1)) == 4: # Year of birth
229
- try:
230
- return str(2025 - int(match.group(1)))
231
- except:
232
- pass
233
  return match.group(1)
234
-
235
  return "Not specified"
236
 
237
  def extract_industry(text):
238
  industries = {
239
- "Technology": ["software", "programming", "developer", "IT", "tech"],
240
- "Finance": ["banking", "financial", "accounting", "finance"],
241
- "Healthcare": ["medical", "health", "hospital", "clinical"],
242
- "Education": ["teaching", "education", "university", "school"],
243
- "Marketing": ["marketing", "advertising", "digital marketing"],
244
- "Engineering": ["engineer", "engineering", "mechanical"],
245
- "Data Science": ["data science", "machine learning", "AI"],
246
- "Management": ["manager", "management", "leadership"]
247
  }
248
 
249
  text_lower = text.lower()
250
- industry_counts = {industry: sum(text_lower.count(kw.lower()) for kw in keywords)
251
- for industry, keywords in industries.items()}
252
-
253
- if industry_counts:
254
- return max(industry_counts.items(), key=lambda x: x[1])[0]
255
-
256
- return "Not clearly specified"
257
 
258
  def extract_job_position(text):
259
- objective_patterns = [
260
- r'objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
261
- r'career\s*objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)'
262
- ]
263
-
264
- for pattern in objective_patterns:
265
- match = re.search(pattern, text.lower(), re.IGNORECASE | re.DOTALL)
266
  if match:
267
- objective_text = match.group(1).strip()
268
- job_titles = ["developer", "engineer", "analyst", "manager", "director", "specialist"]
269
-
270
- for title in job_titles:
271
- if title in objective_text:
272
- title_pattern = r'(?:a|an)?\s*(\w+\s+' + title + r'|\w+\s+\w+\s+' + title + r')'
273
- title_match = re.search(title_pattern, objective_text)
274
- if title_match:
275
- return title_match.group(1).strip().title()
276
- return title.title()
277
-
278
- if len(objective_text) > 10:
279
- words = objective_text.split()
280
- return " ".join(words[:10]).title() + "..." if len(words) > 10 else objective_text.title()
281
-
282
- return "Not explicitly stated"
283
-
284
- def extract_skills(text):
285
- tech_skills = [
286
- "Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "React", "Angular",
287
- "Machine Learning", "Data Analysis", "AWS", "Docker", "Git", "Agile",
288
- "Communication", "Leadership", "Project Management"
289
- ]
290
 
291
- return [skill for skill in tech_skills if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text.lower())]
292
 
 
 
 
293
  def summarize_resume_text(resume_text, models):
294
- start_time = time.time()
295
 
 
296
  name = extract_name(resume_text[:500])
297
  age = extract_age(resume_text)
298
  industry = extract_industry(resume_text)
299
  job_position = extract_job_position(resume_text)
300
  skills = extract_skills(resume_text)
301
 
 
302
  try:
303
  if has_pipeline and 'summarizer' in models:
304
- model_summary = models['summarizer'](resume_text[:2000], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
305
  else:
306
  model_summary = summarize_text(resume_text, models, max_length=100)
307
- except Exception as e:
308
- st.warning(f"Error in resume summarization: {e}")
309
- model_summary = "Error generating summary. Please check the original resume."
310
 
311
- formatted_summary = f"Name: {name}\n\nAge: {age}\n\nExpected Industry: {industry}\n\n"
312
- formatted_summary += f"Expected Job Position: {job_position}\n\nSkills: {', '.join(skills)}\n\nSummary: {model_summary}"
 
313
 
314
- return formatted_summary, time.time() - start_time
315
 
316
  def extract_job_requirements(job_description, models):
317
  tech_skills = [
318
- "Python", "Java", "JavaScript", "SQL", "React", "Angular",
319
- "Machine Learning", "AWS", "Docker", "Git", "Agile",
320
- "Communication", "Leadership", "Project Management"
321
  ]
322
 
323
- clean_job_text = job_description.lower()
324
 
325
- # Extract title
326
- title_patterns = [r'^([^:.\n]+?)(position|role|job)', r'^([^:.\n]+?)\n']
327
  job_title = "Not specified"
328
-
329
- for pattern in title_patterns:
330
- match = re.search(pattern, clean_job_text, re.IGNORECASE)
331
- if match and 3 <= len(match.group(1).strip()) <= 50:
332
- job_title = match.group(1).strip().capitalize()
333
- break
334
-
335
- # Extract experience
336
- exp_match = re.search(r'(\d+)(?:\+)?\s*(?:years|yrs)(?:\s*of)?\s*(?:experience|exp)', clean_job_text)
337
- years_required = int(exp_match.group(1)) if exp_match else 0
 
 
 
 
 
 
 
338
 
339
  # Extract skills
340
- required_skills = [skill for skill in tech_skills if re.search(r'\b' + re.escape(skill.lower()) + r'\b', clean_job_text)]
341
 
342
- # If no skills found, extract common words
343
  if not required_skills:
344
- words = re.findall(r'\b\w{4,}\b', clean_job_text)
 
345
  word_counts = {}
346
- for word in words:
347
- if word not in ["with", "that", "this", "have", "from", "they", "will"]:
348
- word_counts[word] = word_counts.get(word, 0) + 1
349
-
350
- required_skills = [word.capitalize() for word, _ in sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:5]]
351
-
352
- job_summary = summarize_text(job_description, models, max_length=100)
353
 
354
  return {
355
  "title": job_title,
356
  "years_experience": years_required,
357
  "required_skills": required_skills,
358
- "summary": job_summary
359
  }
360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  def analyze_job_fit(resume_summary, job_description, models):
362
- start_time = time.time()
363
  job_requirements = extract_job_requirements(job_description, models)
364
- assessment, fit_score, execution_time = evaluate_job_fit(resume_summary, job_requirements, models)
365
- return assessment, fit_score, time.time() - start_time
366
-
367
- # Load models at startup
368
- models = load_models()
369
 
370
  #####################################
371
- # Main Function - Kept intact
372
  #####################################
373
  def main():
374
- """Main function for the Streamlit application"""
375
  st.title("Resume-Job Fit Analyzer")
376
- st.markdown(
377
- """
378
- Upload your resume file in **.docx**, **.doc**, or **.txt** format and enter a job description to see how well you match with the job requirements.
379
- """
380
- )
381
-
382
- # Resume upload
383
- uploaded_file = st.file_uploader("Upload your resume (.docx, .doc, or .txt)", type=["docx", "doc", "txt"])
384
 
385
- # Job description input
 
 
386
  job_description = st.text_area("Enter Job Description", height=200, placeholder="Paste the job description here...")
387
 
388
- # Process button with optimized flow
389
- if uploaded_file is not None and job_description and st.button("Analyze Job Fit"):
390
- # Create a placeholder for the progress bar
391
- progress_bar = st.progress(0)
392
- status_text = st.empty()
393
 
394
  # Step 1: Extract text
395
- status_text.text("Step 1/3: Extracting text from resume...")
396
  resume_text = extract_text_from_file(uploaded_file)
397
- progress_bar.progress(25)
398
 
399
  if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx, .doc, or .txt file.":
400
  st.error(resume_text)
401
  else:
402
  # Step 2: Generate summary
403
- status_text.text("Step 2/3: Analyzing resume and generating summary...")
404
- summary, summarization_time = summarize_resume_text(resume_text, models)
405
- progress_bar.progress(50)
406
-
407
- # Display summary
408
  st.subheader("Your Resume Summary")
409
  st.markdown(summary)
410
 
411
- # Step 3: Generate job fit assessment
412
- status_text.text("Step 3/3: Evaluating job fit (this will take a moment)...")
413
- assessment, fit_score, assessment_time = analyze_job_fit(summary, job_description, models)
414
- progress_bar.progress(100)
 
415
 
416
- # Clear status messages
417
- status_text.empty()
418
-
419
- # Display job fit results
420
  st.subheader("Job Fit Assessment")
421
-
422
- # Display fit score with label
423
- fit_labels = {
424
- 0: "NO FIT",
425
- 1: "POTENTIAL FIT",
426
- 2: "GOOD FIT"
427
- }
428
-
429
- # Show the score prominently with appropriate coloring
430
- score_label = fit_labels[fit_score]
431
- score_colors = {0: "red", 1: "orange", 2: "green"}
432
- st.markdown(f"<h2 style='color: {score_colors[fit_score]};'>{score_label}</h2>", unsafe_allow_html=True)
433
-
434
- # Display assessment
435
  st.markdown(assessment)
436
-
437
- st.info(f"Analysis completed in {(summarization_time + assessment_time):.2f} seconds")
438
 
439
- # Add potential next steps based on the fit score
440
  st.subheader("Recommended Next Steps")
441
-
442
  if fit_score == 2:
443
  st.markdown("""
444
  - Apply for this position as you appear to be a good match
@@ -458,6 +389,5 @@ def main():
458
  - Consider similar roles with fewer experience requirements
459
  """)
460
 
461
- # Run the main function
462
  if __name__ == "__main__":
463
  main()
 
1
+ import os, io, re, time, tempfile
2
  import streamlit as st
3
+ import docx, docx2txt
 
 
 
 
4
  import pandas as pd
5
  from functools import lru_cache
6
 
7
+ # Handle imports
8
  try:
9
  from transformers import pipeline
10
  has_pipeline = True
 
12
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM
13
  import torch
14
  has_pipeline = False
 
15
 
16
+ # Setup page
17
  st.set_page_config(page_title="Resume-Job Fit Analyzer", initial_sidebar_state="collapsed")
18
+ st.markdown("""<style>[data-testid="collapsedControl"],[data-testid="stSidebar"] {display: none;}</style>""", unsafe_allow_html=True)
19
 
20
  #####################################
21
  # Model Loading & Text Processing
22
  #####################################
23
+ @st.cache_resource
24
  def load_models():
25
+ with st.spinner("Loading AI models..."):
26
  models = {}
 
27
  # Load summarization model
28
  if has_pipeline:
29
+ models['summarizer'] = pipeline("summarization", model="Falconsai/text_summarization", max_length=100)
30
  else:
31
  try:
32
  models['summarizer_model'] = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")
33
  models['summarizer_tokenizer'] = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
34
  except Exception as e:
35
  st.error(f"Error loading summarization model: {e}")
36
+ models['summarizer_model'] = models['summarizer_tokenizer'] = None
 
37
 
38
+ # Load evaluation model
39
  if has_pipeline:
40
  models['evaluator'] = pipeline("sentiment-analysis", model="CR7CAD/RobertaFinetuned")
41
  else:
 
44
  models['evaluator_tokenizer'] = AutoTokenizer.from_pretrained("CR7CAD/RobertaFinetuned")
45
  except Exception as e:
46
  st.error(f"Error loading sentiment model: {e}")
47
+ models['evaluator_model'] = models['evaluator_tokenizer'] = None
 
 
48
  return models
49
 
def summarize_text(text, models, max_length=100):
    """Summarize ``text``, degrading gracefully when no model is usable.

    Strategies are tried in order and the first one that succeeds wins:

    1. ``models['summarizer']`` - a callable (such as a transformers
       summarization pipeline) returning ``[{'summary_text': ...}]``.
    2. ``models['summarizer_model']`` with ``models['summarizer_tokenizer']``
       - a seq2seq model driven manually through ``generate``.
    3. An extractive fallback that keeps, in document order, the sentences
       of at least four words that fit in a ``max_length`` word budget.

    Args:
        text: Text to summarize.
        models: Mapping of loaded models; any of the keys above may be
            missing or ``None``.
        max_length: Generation limit for the manual model path and word
            budget for the extractive fallback.

    Returns:
        The summary string, or ``""`` when nothing could be extracted.
    """
    input_text = text[:1024]  # keep the model input bounded

    # Strategy 1: pipeline-style callable. Presence in ``models`` is the
    # only requirement, so the function does not depend on a module flag.
    summarizer = models.get('summarizer')
    if summarizer is not None:
        try:
            return summarizer(input_text)[0]['summary_text']
        except Exception:
            # Best effort: fall through to the next strategy. Unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
            pass

    # Strategy 2: manual tokenizer + model.generate.
    model = models.get('summarizer_model')
    if model is not None:
        try:
            tokenizer = models['summarizer_tokenizer']
            inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
            summary_ids = model.generate(inputs.input_ids, max_length=max_length, min_length=30, num_beams=4)
            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        except Exception:
            pass

    # Strategy 3: extractive fallback. Earlier sentences have priority, so
    # walking the sentences in order both ranks them and keeps them in
    # document order (duplicates included) without any index lookups.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    kept, used_words = [], 0
    for sentence in sentences:
        word_count = len(sentence.split())
        if word_count < 4:
            continue  # too short to be informative
        if used_words + word_count <= max_length:
            kept.append(sentence)
            used_words += word_count
    return " ".join(kept)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  #####################################
87
+ # File Processing & Information Extraction
88
  #####################################
@st.cache_data
def extract_text_from_file(file_obj):
    """Return the text of an uploaded resume, capped at 15000 characters.

    The file type is chosen from the extension of ``file_obj.name``:
    ``.docx`` is read with ``docx.Document``, ``.doc`` is written to a
    temporary file and passed to ``docx2txt.process``, and ``.txt`` is
    decoded as UTF-8.

    Args:
        file_obj: Uploaded file object exposing ``name`` and ``getvalue()``.

    Returns:
        The extracted text, or a message starting with ``"Error"`` /
        ``"Unsupported file type"`` when extraction is not possible. Callers
        detect failure by inspecting that message prefix.
    """
    max_chars = 15000
    ext = os.path.splitext(file_obj.name)[1].lower()

    if ext == ".docx":
        try:
            document = docx.Document(file_obj)
            return "\n".join(para.text for para in document.paragraphs if para.text.strip())[:max_chars]
        except Exception as e:
            return f"Error processing DOCX file: {e}"
    elif ext == ".doc":
        temp_path = None
        try:
            # Close the temp file before handing its path to docx2txt so the
            # bytes are flushed and the file can be reopened on any platform.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
                temp_file.write(file_obj.getvalue())
                temp_path = temp_file.name
            return docx2txt.process(temp_path)[:max_chars]
        except Exception as e:
            return f"Error processing DOC file: {e}"
        finally:
            # Always remove the temp file, including when processing fails.
            if temp_path is not None and os.path.exists(temp_path):
                os.unlink(temp_path)
    elif ext == ".txt":
        try:
            return file_obj.getvalue().decode("utf-8")[:max_chars]
        except Exception as e:
            return f"Error processing TXT file: {e}"
    else:
        return "Unsupported file type. Please upload a .docx, .doc, or .txt file."
115
 
116
+ # Information extraction functions
def extract_skills(text):
    """Return the known skills mentioned in ``text``.

    Skills are matched case-insensitively as whole terms, so "Java" is not
    reported for "JavaScript" and "AI" is not reported for "maintain". The
    boundaries are expressed with look-arounds rather than ``\\b`` so that
    names ending in punctuation ("C++", "C#") still match.

    Args:
        text: Free text such as a resume or a resume summary.

    Returns:
        A list of skill names in catalogue order, each listed at most once
        even when it belongs to several categories.
    """
    skill_keywords = {
        "Programming": ["Python", "Java", "JavaScript", "HTML", "CSS", "SQL", "C++", "C#", "React", "Angular"],
        "Data Science": ["Machine Learning", "Data Analysis", "Statistics", "TensorFlow", "PyTorch", "AI", "NLP"],
        "Database": ["SQL", "MySQL", "MongoDB", "PostgreSQL", "Oracle", "Redis"],
        "Web Dev": ["React", "Angular", "Node.js", "Frontend", "Backend", "Full-Stack", "REST API"],
        "Software Dev": ["Agile", "Scrum", "Git", "DevOps", "Docker", "CI/CD", "Jenkins"],
        "Cloud": ["AWS", "Azure", "Google Cloud", "Lambda", "S3", "EC2"],
        "Business": ["Project Management", "Leadership", "Teamwork", "Agile", "Scrum"],
    }

    text_lower = text.lower()
    found = []
    seen = set()
    for skills in skill_keywords.values():
        for skill in skills:
            if skill in seen:
                continue  # already reported via another category
            pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
            if re.search(pattern, text_lower):
                seen.add(skill)
                found.append(skill)
    return found
131
+
def _mentions_keyword(line, keywords):
    """Tell whether ``line`` mentions any of ``keywords`` (case-insensitive).

    Keywords of one or two letters (such as "cv") must appear as a whole
    word, so that names like "McVey" are not rejected; longer keywords keep
    plain substring matching.
    """
    lowered = line.lower()
    words = set(re.findall(r'[a-z]+', lowered))
    return any(
        (keyword in words) if len(keyword) <= 2 else (keyword in lowered)
        for keyword in keywords
    )


@lru_cache(maxsize=32)
def extract_name(text_start):
    """Guess the candidate's name from the opening of a resume.

    Only the first five raw lines are considered (blank ones dropped). The
    first non-blank line is returned when it is 5-40 characters long and is
    not a document heading; otherwise the first of the top three lines with
    at most four words and no contact/heading keyword is returned.

    Args:
        text_start: Leading portion of the resume text.

    Returns:
        The guessed name, or ``"Unknown"`` when no line qualifies.
    """
    lines = [line.strip() for line in text_start.split('\n')[:5] if line.strip()]

    if lines:
        first_line = lines[0]
        if 5 <= len(first_line) <= 40 and not _mentions_keyword(
                first_line, ["resume", "cv", "curriculum", "vitae"]):
            return first_line

        for line in lines[:3]:
            if len(line.split()) <= 4 and not _mentions_keyword(
                    line, ["address", "phone", "email", "resume", "cv"]):
                return line
    return "Unknown"
 
145
 
def extract_age(text):
    """Extract the candidate's age from resume text.

    Patterns are tried in this order: an explicit ``age: NN``, ``NN years
    old``, then a birth year after ``dob:`` or ``date of birth:``. A birth
    year is converted to an age using the current calendar year.

    Args:
        text: Resume text (matching is case-insensitive).

    Returns:
        The age as a string of digits, or ``"Not specified"``.
    """
    from datetime import date  # local import: only this function needs it

    lowered = text.lower()
    # (pattern, captures_birth_year). ``\b`` before "age" keeps words such
    # as "page 12" from matching; the birth-year patterns are lazy and
    # restricted to 19xx/20xx so trailing digits on the same line (e.g. a
    # phone number) are not mistaken for the year.
    patterns = [
        (r'\bage:?\s*(\d{1,2})(?!\d)', False),
        (r'\b(\d{1,2})\s*years\s*old', False),
        (r'\bdob:.*?\b((?:19|20)\d{2})\b', True),
        (r'\bdate of birth:.*?\b((?:19|20)\d{2})\b', True),
    ]

    for pattern, is_birth_year in patterns:
        match = re.search(pattern, lowered)
        if not match:
            continue
        if not is_birth_year:
            return match.group(1)
        age = date.today().year - int(match.group(1))
        if age >= 0:  # ignore birth years in the future
            return str(age)
    return "Not specified"
155
 
def extract_industry(text):
    """Pick the industry whose keywords occur most often in ``text``.

    Lower-case keywords are counted as case-insensitive substrings.
    All-caps keywords ("IT", "AI") are acronyms: they are counted only as
    whole, upper-case words in the original text, since matching them
    against lower-cased text would either never succeed or would hit
    ordinary words such as "it".

    Args:
        text: Resume text.

    Returns:
        The best-scoring industry name (the first listed wins a tie), or
        ``"Not specified"`` when no keyword occurs at all.
    """
    industries = {
        "Technology": ["software", "programming", "developer", "IT", "tech", "computer", "digital"],
        "Finance": ["banking", "financial", "accounting", "finance", "analyst"],
        "Healthcare": ["medical", "health", "hospital", "clinical", "nurse", "doctor"],
        "Education": ["teaching", "teacher", "professor", "education", "university", "school"],
        "Marketing": ["marketing", "advertising", "digital marketing", "social media", "brand"],
        "Engineering": ["engineer", "engineering", "mechanical", "civil", "electrical"],
        "Data Science": ["data science", "machine learning", "AI", "analytics", "big data"],
        "Management": ["manager", "management", "leadership", "executive", "director"],
    }

    text_lower = text.lower()

    def occurrences(keyword):
        if keyword.isupper():
            return len(re.findall(r'\b' + re.escape(keyword) + r'\b', text))
        return text_lower.count(keyword)

    counts = {
        industry: sum(occurrences(keyword) for keyword in keywords)
        for industry, keywords in industries.items()
    }
    best = max(counts, key=counts.get)
    return best if counts[best] else "Not specified"
 
 
 
 
 
171
 
def extract_job_position(text):
    """Infer the job position a resume is targeting.

    The objective / summary / "seeking ... position as" statement is
    located first. If it names a known role word (developer, engineer, ...),
    the role is returned together with the word preceding it when there is
    one; otherwise the statement itself is returned, truncated to ten words.
    An empty statement is skipped rather than returned. When no statement
    yields a result, a title next to an "experience:" entry or a
    "(current"/"(present" marker is used.

    Args:
        text: Resume text (matching is case-insensitive).

    Returns:
        The position in title case, or ``"Not specified"``.
    """
    text_lower = text.lower()
    statement_patterns = [
        r'objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'career\s*objective:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'summary:?\s*(.*?)(?=\n\n|\n\w+:|\Z)',
        r'seeking.*position.*as\s*([^.]*)',
    ]
    role_words = ["developer", "engineer", "analyst", "manager", "specialist", "designer"]

    for pattern in statement_patterns:
        match = re.search(pattern, text_lower, re.IGNORECASE | re.DOTALL)
        if not match:
            continue
        statement = match.group(1).strip()
        if not statement:
            continue  # heading with no content: try the next pattern

        for role in role_words:
            if role in statement:
                qualified = re.search(r'(\w+\s+' + role + r')', statement)
                if qualified:
                    return qualified.group(1).strip().title()
                return role.title()

        words = statement.split()
        if len(words) > 10:
            return " ".join(words[:10]).title() + "..."
        return statement.title()

    # Check for job title near experience
    experience_patterns = [
        r'experience:.*?(\w+\s+\w+(?:\s+\w+)?)(?=\s*at|\s*\()',
        r'(\w+\s+\w+(?:\s+\w+)?)\s*\(\s*(?:current|present)',
    ]
    for pattern in experience_patterns:
        match = re.search(pattern, text_lower, re.IGNORECASE)
        if match:
            return match.group(1).strip().title()

    return "Not specified"
191
 
192
+ #####################################
193
+ # Core Analysis Functions
194
+ #####################################
def summarize_resume_text(resume_text, models):
    """Build the formatted resume summary shown to the user.

    Combines the rule-based fields (name, age, industry, position, skills)
    with a model-generated summary. The labelled layout ("Name: ...",
    "Expected Industry: ...") is parsed again by ``evaluate_job_fit``, so the
    labels must stay as they are.

    Args:
        resume_text: Full resume text.
        models: Mapping of loaded models as returned by ``load_models``.

    Returns:
        A ``(summary, elapsed_seconds)`` tuple.
    """
    start = time.time()

    # Basic info extraction
    name = extract_name(resume_text[:500])
    age = extract_age(resume_text)
    industry = extract_industry(resume_text)
    job_position = extract_job_position(resume_text)
    skills = extract_skills(resume_text)

    # Generate summary
    try:
        if has_pipeline and 'summarizer' in models:
            model_summary = models['summarizer'](resume_text[:2000], max_length=100, min_length=30)[0]['summary_text']
        else:
            model_summary = summarize_text(resume_text, models, max_length=100)
    except Exception:
        # Keep the page usable when the model fails, without swallowing
        # KeyboardInterrupt/SystemExit as a bare ``except`` would.
        model_summary = "Error generating summary."

    # Format result
    summary = f"Name: {name}\n\nAge: {age}\n\nExpected Industry: {industry}\n\n"
    summary += f"Expected Job Position: {job_position}\n\nSkills: {', '.join(skills)}\n\nSummary: {model_summary}"

    return summary, time.time() - start
219
 
def extract_job_requirements(job_description, models):
    """Derive structured requirements from a free-text job description.

    Args:
        job_description: The job posting text.
        models: Mapping of loaded models, forwarded to ``summarize_text``.

    Returns:
        A dict with the keys ``"title"`` (str, ``"Not specified"`` when no
        title is found), ``"years_experience"`` (int, 0 when not stated),
        ``"required_skills"`` (list of str) and ``"summary"`` (str).
    """
    from collections import Counter  # local import: only the fallback needs it

    tech_skills = [
        "Python", "Java", "JavaScript", "SQL", "HTML", "CSS", "React", "Angular", "Machine Learning", "AWS",
        "Azure", "Docker", "MySQL", "MongoDB", "Project Management", "Agile", "Leadership", "Git", "DevOps",
    ]

    clean_text = job_description.lower()

    # Extract job title: the first pattern whose capture has a plausible
    # length wins. Every pattern exposes the title as group 1.
    job_title = "Not specified"
    for pattern in [r'^([^:.\n]+?)(position|role|job)', r'^([^:.\n]+?)\n', r'hiring.*? ([^:.\n]+?)(:-|[.:]|\n|$)']:
        match = re.search(pattern, clean_text, re.IGNORECASE)
        if match:
            title = match.group(1).strip()
            if 3 <= len(title) <= 50:
                job_title = title.capitalize()
                break

    # Extract years required. The capture is ``\d+``, so ``int`` cannot fail.
    years_required = 0
    for pattern in [r'(\d+)(?:\+)?\s*(?:years|yrs).*?experience', r'experience.*?(\d+)(?:\+)?\s*(?:years|yrs)']:
        match = re.search(pattern, clean_text, re.IGNORECASE)
        if match:
            years_required = int(match.group(1))
            break

    # Extract skills
    required_skills = [
        skill for skill in tech_skills
        if re.search(r'\b' + re.escape(skill.lower()) + r'\b', clean_text)
    ]

    # Fallback if no skills found: the five most frequent words of four or
    # more characters, excluding a few stop words.
    if not required_skills:
        stop_words = ["with", "that", "this", "have", "from", "they", "will", "what", "your"]
        words = [w for w in re.findall(r'\b\w{4,}\b', clean_text) if w not in stop_words]
        required_skills = [w.capitalize() for w, _ in Counter(words).most_common(5)]

    return {
        "title": job_title,
        "years_experience": years_required,
        "required_skills": required_skills,
        "summary": summarize_text(job_description, models, max_length=100)
    }
265
 
def evaluate_job_fit(resume_summary, job_requirements, models):
    """Score how well a summarized resume matches a job's requirements.

    Three sub-scores, each capped at 2, are combined with fixed weights:
    skills 0.5, experience 0.3 and job-title overlap 0.2. A weighted score of
    at least 1.5 is a good fit (2), at least 0.8 a potential fit (1), and
    anything lower is no fit (0).

    Args:
        resume_summary: Formatted summary containing "Name: ..." and
            "Expected Industry: ..." lines.
        job_requirements: Dict with the keys ``"required_skills"``,
            ``"years_experience"`` and ``"title"``.
        models: Unused here; kept so existing callers keep working.

    Returns:
        An ``(assessment_text, fit_score, elapsed_seconds)`` tuple.
    """
    start = time.time()

    # Basic extraction
    required_skills = job_requirements["required_skills"]
    years_required = job_requirements["years_experience"]
    job_title = job_requirements["title"]
    skills_mentioned = extract_skills(resume_summary)

    # Calculate matches
    matching_skills = [skill for skill in required_skills if skill in skills_mentioned]
    skill_match = len(matching_skills) / len(required_skills) if required_skills else 0

    # Extract experience. The capture is ``\d+``, so ``int`` cannot fail.
    years_experience = 0
    exp_match = re.search(r'(\d+)\+?\s*years?\s*(?:of)?\s*experience', resume_summary, re.IGNORECASE)
    if exp_match:
        years_experience = int(exp_match.group(1))

    # Calculate scores. With no stated requirement, experience is neutral (0.5).
    exp_match_ratio = min(1.0, years_experience / max(1, years_required)) if years_required > 0 else 0.5
    title_words = [w for w in job_title.lower().split() if len(w) > 3]
    summary_lower = resume_summary.lower()
    title_match = sum(1 for w in title_words if w in summary_lower) / len(title_words) if title_words else 0

    # Final scores
    skill_score = min(2, skill_match * 3)
    exp_score = min(2, exp_match_ratio * 2)
    title_score = min(2, title_match * 2)

    # Extract candidate info
    name_match = re.search(r'Name:\s*(.*?)(?=\n|\Z)', resume_summary)
    name = name_match.group(1).strip() if name_match else "The candidate"

    industry_match = re.search(r'Expected Industry:\s*(.*?)(?=\n|\Z)', resume_summary)
    industry = industry_match.group(1).strip() if industry_match else "unspecified industry"

    # Calculate weighted score
    weighted_score = (skill_score * 0.5) + (exp_score * 0.3) + (title_score * 0.2)
    fit_score = 2 if weighted_score >= 1.5 else (1 if weighted_score >= 0.8 else 0)

    # Generate assessment
    missing = [skill for skill in required_skills if skill not in skills_mentioned]

    if fit_score == 2:
        assessment = f"{fit_score}: GOOD FIT - {name} demonstrates strong alignment with the {job_title} position. Their background in {industry} appears well-suited for this role's requirements."
    elif fit_score == 1:
        assessment = f"{fit_score}: POTENTIAL FIT - {name} shows potential for the {job_title} role but has gaps in certain areas."
        # Only name training areas when some exist; a potential fit with
        # every skill matched would otherwise end in "needed in .".
        if missing:
            assessment += f" Additional training might be needed in {', '.join(missing[:2])}."
    else:
        assessment = f"{fit_score}: NO FIT - {name}'s background shows limited alignment with this {job_title} position. Their experience and skills differ significantly from the requirements."

    return assessment, fit_score, time.time() - start
318
+
def analyze_job_fit(resume_summary, job_description, models):
    """Run the full job-fit analysis for one resume summary.

    Extracts the requirements from ``job_description`` and evaluates the
    resume summary against them.

    Returns:
        An ``(assessment, fit_score, elapsed_seconds)`` tuple, where the
        elapsed time covers both the extraction and the evaluation.
    """
    started_at = time.time()
    requirements = extract_job_requirements(job_description, models)
    # The evaluator's own timing is discarded in favour of the total below.
    verdict_text, verdict_score, _inner_elapsed = evaluate_job_fit(
        resume_summary, requirements, models
    )
    elapsed = time.time() - started_at
    return verdict_text, verdict_score, elapsed
 
 
 
324
 
325
  #####################################
326
+ # Main Function
327
  #####################################
328
  def main():
 
329
  st.title("Resume-Job Fit Analyzer")
330
+ st.markdown("Upload your resume file in **.docx**, **.doc**, or **.txt** format and enter a job description to see how well you match.")
 
 
 
 
 
 
 
331
 
332
+ # Load models and get inputs
333
+ models = load_models()
334
+ uploaded_file = st.file_uploader("Upload your resume", type=["docx", "doc", "txt"])
335
  job_description = st.text_area("Enter Job Description", height=200, placeholder="Paste the job description here...")
336
 
337
+ # Process when button clicked
338
+ if uploaded_file and job_description and st.button("Analyze Job Fit"):
339
+ progress = st.progress(0)
340
+ status = st.empty()
 
341
 
342
  # Step 1: Extract text
343
+ status.text("Step 1/3: Extracting text from resume...")
344
  resume_text = extract_text_from_file(uploaded_file)
345
+ progress.progress(25)
346
 
347
  if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx, .doc, or .txt file.":
348
  st.error(resume_text)
349
  else:
350
  # Step 2: Generate summary
351
+ status.text("Step 2/3: Analyzing resume...")
352
+ summary, summary_time = summarize_resume_text(resume_text, models)
353
+ progress.progress(50)
 
 
354
  st.subheader("Your Resume Summary")
355
  st.markdown(summary)
356
 
357
+ # Step 3: Evaluate fit
358
+ status.text("Step 3/3: Evaluating job fit...")
359
+ assessment, fit_score, eval_time = analyze_job_fit(summary, job_description, models)
360
+ progress.progress(100)
361
+ status.empty()
362
 
363
+ # Display results
 
 
 
364
  st.subheader("Job Fit Assessment")
365
+ fit_labels = {0: "NOT FIT", 1: "POTENTIAL FIT", 2: "GOOD FIT"}
366
+ colors = {0: "red", 1: "orange", 2: "green"}
367
+ st.markdown(f"<h2 style='color: {colors[fit_score]};'>{fit_labels[fit_score]}</h2>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
368
  st.markdown(assessment)
369
+ st.info(f"Analysis completed in {(summary_time + eval_time):.2f} seconds")
 
370
 
371
+ # Recommendations
372
  st.subheader("Recommended Next Steps")
 
373
  if fit_score == 2:
374
  st.markdown("""
375
  - Apply for this position as you appear to be a good match
 
389
  - Consider similar roles with fewer experience requirements
390
  """)
391
 
 
392
  if __name__ == "__main__":
393
  main()