root commited on
Commit
1d59f84
Β·
1 Parent(s): a83946a
Files changed (1) hide show
  1. app.py +756 -323
app.py CHANGED
@@ -3,22 +3,22 @@ import pandas as pd
3
  import numpy as np
4
  import torch
5
  import nltk
6
- import faiss
7
  import os
8
  import tempfile
9
  import base64
10
- import re
11
- import io
12
  from rank_bm25 import BM25Okapi
13
- from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
14
- from nltk.tokenize import word_tokenize, sent_tokenize
15
- from tqdm import tqdm
16
  import pdfplumber
17
  import PyPDF2
18
  from docx import Document
19
  import csv
20
  from datasets import load_dataset
21
  import gc
 
 
 
 
22
 
23
  # Download NLTK resources
24
  try:
@@ -47,79 +47,93 @@ with st.sidebar:
47
  # Advanced options
48
  st.subheader("Advanced Options")
49
  top_k = st.number_input("Number of results to display", min_value=1, max_value=50, value=10, step=1)
50
- use_explanation = st.checkbox("Generate AI Explanations", value=True)
 
 
 
 
 
 
51
 
52
  st.markdown("---")
53
- st.markdown("### πŸ€– Models Used")
54
- st.markdown("- **Embedding**: NVIDIA NV-Embed-v2")
55
- st.markdown("- **Explanation**: Qwen3-14B (4-bit)")
56
- st.markdown("### πŸ“Š About")
57
- st.markdown("This app uses hybrid ranking combining semantic similarity with keyword matching to find the best candidates for job positions.")
 
 
 
 
 
 
 
58
 
59
  # Initialize session state
60
  if 'embedding_model' not in st.session_state:
61
  st.session_state.embedding_model = None
62
- if 'explanation_model' not in st.session_state:
63
- st.session_state.explanation_model = None
64
  if 'results' not in st.session_state:
65
  st.session_state.results = []
 
 
 
 
 
 
 
 
 
 
66
 
67
  @st.cache_resource
68
  def load_embedding_model():
69
- """Load and cache the embedding model"""
70
  try:
71
- with st.spinner("πŸ”„ Loading NVIDIA NV-Embed-v2 model..."):
72
- tokenizer = AutoTokenizer.from_pretrained("nvidia/NV-Embed-v2", trust_remote_code=True)
73
- model = AutoModel.from_pretrained(
74
- "nvidia/NV-Embed-v2",
75
- trust_remote_code=True,
76
- device_map="auto",
77
- torch_dtype=torch.float16
78
- )
79
  st.success("βœ… Embedding model loaded successfully!")
80
- return model, tokenizer
81
  except Exception as e:
82
  st.error(f"❌ Error loading embedding model: {str(e)}")
83
- return None, None
84
 
85
  @st.cache_resource
86
- def load_explanation_model():
87
- """Load and cache the explanation model with quantization"""
88
- if not use_explanation:
89
- return None, None
90
-
91
  try:
92
- with st.spinner("πŸ”„ Loading Qwen3-14B model with 4-bit quantization..."):
93
- # Configure 4-bit quantization
94
- quantization_config = BitsAndBytesConfig(
95
- load_in_4bit=True,
96
- bnb_4bit_quant_type="nf4",
97
- bnb_4bit_compute_dtype=torch.float16,
98
- bnb_4bit_use_double_quant=True
99
- )
100
-
101
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct", trust_remote_code=True)
102
- model = AutoModelForCausalLM.from_pretrained(
103
- "Qwen/Qwen2.5-14B-Instruct",
104
- quantization_config=quantization_config,
105
- device_map="auto",
106
- trust_remote_code=True,
107
- torch_dtype=torch.float16
108
- )
109
- st.success("βœ… Explanation model loaded successfully!")
110
- return model, tokenizer
111
  except Exception as e:
112
- st.error(f"❌ Error loading explanation model: {str(e)}")
113
- return None, None
114
 
115
  class ResumeScreener:
116
  def __init__(self):
117
  # Load models
118
- self.embedding_model, self.embedding_tokenizer = load_embedding_model()
119
- if use_explanation:
120
- self.explanation_model, self.explanation_tokenizer = load_explanation_model()
121
- else:
122
- self.explanation_model, self.explanation_tokenizer = None, None
 
 
123
 
124
  def extract_text_from_file(self, file_path, file_type):
125
  """Extract text from various file types"""
@@ -158,48 +172,35 @@ class ResumeScreener:
158
  return ""
159
 
160
  def get_embedding(self, text):
161
- """Generate embedding for text"""
162
  if self.embedding_model is None:
163
- return np.zeros(4096) # NV-Embed-v2 dimension
 
164
 
165
  try:
166
- # Truncate text to avoid memory issues
167
- text = text[:8192] # Reasonable limit for NV-Embed-v2
168
-
169
- inputs = self.embedding_tokenizer(
170
- text,
171
- return_tensors="pt",
172
- truncation=True,
173
- max_length=512,
174
- padding=True
175
- )
176
-
177
- # Move to same device as model
178
- device = next(self.embedding_model.parameters()).device
179
- inputs = {k: v.to(device) for k, v in inputs.items()}
180
 
181
- with torch.no_grad():
182
- outputs = self.embedding_model(**inputs)
183
-
184
- # Extract embeddings - NV-Embed-v2 specific
185
- if hasattr(outputs, 'pooler_output'):
186
- embeddings = outputs.pooler_output
187
- elif hasattr(outputs, 'last_hidden_state'):
188
- embeddings = outputs.last_hidden_state.mean(dim=1)
189
- else:
190
- embeddings = outputs[0].mean(dim=1)
191
-
192
- return embeddings.cpu().numpy().squeeze()
193
 
 
 
 
 
 
 
194
  except Exception as e:
195
  st.error(f"Error generating embedding: {str(e)}")
196
- return np.zeros(4096)
197
 
198
  def calculate_bm25_scores(self, resume_texts, job_description):
199
  """Calculate BM25 scores for keyword matching"""
200
  try:
201
  job_tokens = word_tokenize(job_description.lower())
202
- corpus = [word_tokenize(text.lower()) for text in resume_texts if text.strip()]
203
 
204
  if not corpus:
205
  return [0.0] * len(resume_texts)
@@ -212,146 +213,382 @@ class ResumeScreener:
212
  st.error(f"Error calculating BM25 scores: {str(e)}")
213
  return [0.0] * len(resume_texts)
214
 
215
- def calculate_hybrid_scores(self, resume_texts, job_description):
216
- """Calculate hybrid scores combining semantic and keyword matching"""
217
- # Get job embedding
218
- job_embedding = self.get_embedding(job_description)
 
 
 
 
219
 
220
- # Get resume embeddings
221
- resume_embeddings = []
222
- progress_bar = st.progress(0)
223
- for i, text in enumerate(resume_texts):
224
- embedding = self.get_embedding(text)
225
- resume_embeddings.append(embedding)
226
- progress_bar.progress((i + 1) / len(resume_texts))
227
 
228
- # Calculate semantic scores (cosine similarity)
229
- semantic_scores = []
230
- for resume_emb in resume_embeddings:
231
- job_norm = job_embedding / (np.linalg.norm(job_embedding) + 1e-8)
232
- resume_norm = resume_emb / (np.linalg.norm(resume_emb) + 1e-8)
233
- similarity = np.dot(job_norm, resume_norm)
234
- semantic_scores.append(float(similarity))
235
 
236
- # Calculate BM25 scores
237
- bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
 
238
 
239
- # Normalize BM25 scores
240
- if bm25_scores and max(bm25_scores) > 0:
241
- max_bm25 = max(bm25_scores)
242
- bm25_scores = [score / max_bm25 for score in bm25_scores]
243
 
244
- # Calculate hybrid scores
245
- hybrid_scores = [
246
- (semantic_weight * sem_score) + (keyword_weight * bm25_score)
247
- for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
248
- ]
249
-
250
- return hybrid_scores, semantic_scores, bm25_scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
  def extract_skills(self, text, job_description):
253
  """Extract skills from resume based on job description"""
254
- # Common tech skills and job-related terms
 
 
 
255
  common_skills = [
256
- "python", "java", "javascript", "react", "node.js", "sql", "html", "css",
257
- "aws", "azure", "docker", "kubernetes", "git", "agile", "scrum", "ci/cd",
258
- "machine learning", "data science", "artificial intelligence", "tensorflow",
259
- "pytorch", "pandas", "numpy", "scikit-learn", "mysql", "postgresql",
260
- "mongodb", "redis", "elasticsearch", "spark", "hadoop", "tableau", "powerbi"
 
 
 
 
261
  ]
262
 
263
- # Extract skills from job description
264
  job_words = set(word.lower() for word in word_tokenize(job_description) if len(word) > 2)
265
 
266
  # Find matching skills
267
  found_skills = []
268
  text_lower = text.lower()
269
 
270
- # Check common skills
271
  for skill in common_skills:
272
- if skill in text_lower and skill in " ".join(job_words):
273
  found_skills.append(skill)
274
 
275
- # Check job-specific terms
276
  for word in job_words:
277
- if len(word) > 3 and word in text_lower:
278
- found_skills.append(word)
 
 
279
 
280
- return list(set(found_skills))[:10] # Return top 10 unique skills
281
 
282
- def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
283
- """Generate explanation using Qwen model"""
284
- if self.explanation_model is None or self.explanation_tokenizer is None:
285
- return self._generate_simple_explanation(score, semantic_score, bm25_score, skills)
286
-
287
- try:
288
- # Create prompt
289
- prompt = f"""As a recruitment AI assistant, explain why this resume scored {score:.2f} for the given job position.
290
-
291
- Job Requirements:
292
- {job_description[:500]}...
293
-
294
- Resume Summary:
295
- {resume_text[:800]}...
296
-
297
- Scores:
298
- - Overall: {score:.2f}/1.0
299
- - Semantic Match: {semantic_score:.2f}/1.0
300
- - Keyword Match: {bm25_score:.2f}/1.0
301
- - Key Skills: {', '.join(skills[:5])}
302
-
303
- Provide a concise 2-3 sentence explanation of the match quality and key strengths."""
304
-
305
- # Generate response
306
- messages = [{"role": "user", "content": prompt}]
307
- text = self.explanation_tokenizer.apply_chat_template(
308
- messages, tokenize=False, add_generation_prompt=True
309
- )
310
-
311
- inputs = self.explanation_tokenizer(text, return_tensors="pt").to(self.explanation_model.device)
312
-
313
- with torch.no_grad():
314
- outputs = self.explanation_model.generate(
315
- **inputs,
316
- max_new_tokens=150,
317
- temperature=0.7,
318
- do_sample=True,
319
- pad_token_id=self.explanation_tokenizer.eos_token_id
320
- )
321
-
322
- response = self.explanation_tokenizer.decode(
323
- outputs[0][inputs.input_ids.shape[1]:],
324
- skip_special_tokens=True
325
- )
326
-
327
- return response.strip()[:400] # Limit length
328
-
329
- except Exception as e:
330
- st.warning(f"AI explanation failed: {str(e)}")
331
- return self._generate_simple_explanation(score, semantic_score, bm25_score, skills)
332
-
333
- def _generate_simple_explanation(self, score, semantic_score, bm25_score, skills):
334
- """Fallback explanation generation"""
335
  if score > 0.8:
336
  quality = "excellent"
337
  elif score > 0.6:
338
- quality = "good"
339
  elif score > 0.4:
340
  quality = "moderate"
341
  else:
342
  quality = "limited"
343
 
344
- explanation = f"This resume shows {quality} alignment with the job requirements (score: {score:.2f}). "
345
 
346
  if semantic_score > bm25_score:
347
- explanation += f"Strong conceptual match ({semantic_score:.2f}) with relevant experience. "
348
  else:
349
- explanation += f"Good keyword coverage ({bm25_score:.2f}) of job requirements. "
350
 
351
  if skills:
352
- explanation += f"Key matching skills: {', '.join(skills[:3])}."
353
 
354
  return explanation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
  def create_download_link(df, filename="resume_screening_results.csv"):
357
  """Create download link for results"""
@@ -361,16 +598,22 @@ def create_download_link(df, filename="resume_screening_results.csv"):
361
 
362
  # Main App Interface
363
  st.title("🎯 AI-Powered Resume Screener")
364
- st.markdown("*Find the perfect candidates using advanced AI matching*")
365
  st.markdown("---")
366
 
367
  # Initialize screener
368
- if st.session_state.embedding_model is None:
369
- screener = ResumeScreener()
370
- st.session_state.embedding_model = screener.embedding_model
371
- st.session_state.explanation_model = screener.explanation_model
372
- else:
373
- screener = ResumeScreener()
 
 
 
 
 
 
374
 
375
  # Job Description Input
376
  st.header("πŸ“ Step 1: Enter Job Description")
@@ -383,14 +626,25 @@ job_description = st.text_area(
383
  # Resume Input Options
384
  st.header("πŸ“„ Step 2: Upload Resumes")
385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  input_method = st.radio(
387
  "Choose input method:",
388
  ["πŸ“ Upload Files", "πŸ—‚οΈ Load from CSV Dataset", "πŸ”— Load from Hugging Face Dataset"]
389
  )
390
 
391
- resume_texts = []
392
- file_names = []
393
-
394
  if input_method == "πŸ“ Upload Files":
395
  uploaded_files = st.file_uploader(
396
  "Upload resume files",
@@ -401,23 +655,26 @@ if input_method == "πŸ“ Upload Files":
401
 
402
  if uploaded_files:
403
  with st.spinner(f"πŸ”„ Processing {len(uploaded_files)} files..."):
 
 
 
404
  for file in uploaded_files:
405
  file_type = file.name.split('.')[-1].lower()
406
 
407
- # Save temporary file
408
  with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
409
  tmp_file.write(file.getvalue())
410
  tmp_path = tmp_file.name
411
 
412
- # Extract text
413
  text = screener.extract_text_from_file(tmp_path, file_type)
414
  if text.strip():
415
  resume_texts.append(text)
416
  file_names.append(file.name)
417
 
418
- # Cleanup
419
  os.unlink(tmp_path)
420
-
 
 
 
421
  if resume_texts:
422
  st.success(f"βœ… Successfully processed {len(resume_texts)} resumes")
423
 
@@ -442,6 +699,9 @@ elif input_method == "πŸ—‚οΈ Load from CSV Dataset":
442
 
443
  if st.button("πŸš€ Process CSV Data"):
444
  with st.spinner("πŸ”„ Processing CSV data..."):
 
 
 
445
  for idx, row in df.iterrows():
446
  text = str(row[text_column])
447
  if text and text.strip() and text.lower() != 'nan':
@@ -451,6 +711,9 @@ elif input_method == "πŸ—‚οΈ Load from CSV Dataset":
451
  file_names.append(f"Resume_{idx}")
452
  else:
453
  file_names.append(str(row[name_column]))
 
 
 
454
 
455
  if resume_texts:
456
  st.success(f"βœ… Successfully loaded {len(resume_texts)} resumes from CSV")
@@ -459,7 +722,9 @@ elif input_method == "πŸ—‚οΈ Load from CSV Dataset":
459
  st.error(f"❌ Error processing CSV: {str(e)}")
460
 
461
  elif input_method == "πŸ”— Load from Hugging Face Dataset":
462
- st.markdown("**Quick Load:** [Resume Atlas Dataset](https://huggingface.co/datasets/ahmedheakl/resume-atlas)")
 
 
463
 
464
  col1, col2 = st.columns([2, 1])
465
  with col1:
@@ -479,35 +744,35 @@ elif input_method == "πŸ”— Load from Hugging Face Dataset":
479
  st.success(f"βœ… Loaded dataset with {len(dataset)} entries")
480
  st.write("**Dataset Preview:**")
481
 
482
- # Show first few examples
483
  preview_df = pd.DataFrame(dataset[:5])
484
  st.dataframe(preview_df)
485
 
486
- # Column selection
487
  text_column = st.selectbox(
488
  "Select column with resume text:",
489
  dataset.column_names,
490
- index=0 if 'resume_text' in dataset.column_names else 0
491
  )
492
 
493
  category_column = None
494
  if 'category' in dataset.column_names:
 
495
  category_column = st.selectbox(
496
  "Filter by category (optional):",
497
- ["All"] + list(set(dataset['category']))
498
  )
499
 
500
  max_samples = st.slider("Maximum samples to load:", 10, min(1000, len(dataset)), 100)
501
 
502
  if st.button("πŸš€ Process Dataset"):
503
  with st.spinner("πŸ”„ Processing dataset..."):
 
 
 
504
  filtered_dataset = dataset
505
 
506
- # Apply category filter
507
  if category_column and category_column != "All":
508
  filtered_dataset = dataset.filter(lambda x: x['category'] == category_column)
509
 
510
- # Limit samples
511
  sample_indices = list(range(min(max_samples, len(filtered_dataset))))
512
 
513
  for idx in sample_indices:
@@ -517,11 +782,13 @@ elif input_method == "πŸ”— Load from Hugging Face Dataset":
517
  if text and text.strip() and text.lower() != 'nan':
518
  resume_texts.append(text)
519
 
520
- # Use ID or index for naming
521
  if 'id' in item:
522
  file_names.append(f"Resume_{item['id']}")
523
  else:
524
  file_names.append(f"Resume_{idx}")
 
 
 
525
 
526
  if resume_texts:
527
  st.success(f"βœ… Successfully loaded {len(resume_texts)} resumes")
@@ -530,142 +797,308 @@ elif input_method == "πŸ”— Load from Hugging Face Dataset":
530
  st.error(f"❌ Error loading dataset: {str(e)}")
531
 
532
  # Processing and Results
533
- if st.button("πŸ” Find Best Candidates", disabled=not (job_description and resume_texts)):
534
- if len(resume_texts) == 0:
535
- st.error("❌ Please upload resumes first!")
536
- elif not job_description.strip():
537
- st.error("❌ Please enter a job description!")
538
- else:
539
- with st.spinner("🧠 AI is analyzing resumes..."):
540
- # Calculate scores
541
- hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
542
- resume_texts, job_description
543
- )
544
-
545
- # Prepare results
546
- results = []
547
- for i, (name, text, hybrid_score, semantic_score, bm25_score) in enumerate(
548
- zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores)
549
- ):
550
- # Extract skills
551
- skills = screener.extract_skills(text, job_description)
552
-
553
- # Generate explanation
554
- explanation = ""
555
- if use_explanation:
556
- explanation = screener.generate_explanation(
557
- text, job_description, hybrid_score, semantic_score, bm25_score, skills
558
  )
559
-
560
- results.append({
561
- 'rank': i + 1,
562
- 'name': name,
563
- 'score': hybrid_score,
564
- 'semantic_score': semantic_score,
565
- 'keyword_score': bm25_score,
566
- 'skills': skills,
567
- 'explanation': explanation,
568
- 'text_preview': text[:300] + "..." if len(text) > 300 else text
569
- })
570
-
571
- # Sort by score
572
- results.sort(key=lambda x: x['score'], reverse=True)
573
-
574
- # Update ranks
575
- for i, result in enumerate(results):
576
- result['rank'] = i + 1
577
-
578
- # Store in session state
579
- st.session_state.results = results[:top_k]
580
-
581
- st.success(f"πŸŽ‰ Analysis complete! Found top {len(st.session_state.results)} candidates")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
582
 
583
  # Display Results
584
  if st.session_state.results:
585
  st.header("πŸ† Top Candidates")
586
 
587
- # Create summary dataframe
588
- summary_data = []
589
- for result in st.session_state.results:
590
- summary_data.append({
591
- "Rank": result['rank'],
592
- "Candidate": result['name'],
593
- "Overall Score": f"{result['score']:.3f}",
594
- "Semantic Score": f"{result['semantic_score']:.3f}",
595
- "Keyword Score": f"{result['keyword_score']:.3f}",
596
- "Key Skills": ", ".join(result['skills'][:3]) + ("..." if len(result['skills']) > 3 else ""),
597
- })
598
-
599
- summary_df = pd.DataFrame(summary_data)
600
- st.dataframe(summary_df, use_container_width=True)
601
-
602
- # Download link
603
- detailed_data = []
604
- for result in st.session_state.results:
605
- detailed_data.append({
606
- "Rank": result['rank'],
607
- "Candidate": result['name'],
608
- "Overall_Score": result['score'],
609
- "Semantic_Score": result['semantic_score'],
610
- "Keyword_Score": result['keyword_score'],
611
- "Skills": "; ".join(result['skills']),
612
- "Explanation": result['explanation'],
613
- "Resume_Preview": result['text_preview']
614
- })
615
 
616
- download_df = pd.DataFrame(detailed_data)
617
- st.markdown(create_download_link(download_df), unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
 
619
- # Detailed results
620
- st.subheader("πŸ“‹ Detailed Analysis")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
 
622
- for result in st.session_state.results:
623
- with st.expander(f"πŸ₯‡ #{result['rank']}: {result['name']} (Score: {result['score']:.3f})"):
624
- col1, col2 = st.columns([1, 2])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625
 
626
  with col1:
627
- st.metric("Overall Score", f"{result['score']:.3f}")
628
- st.metric("Semantic Match", f"{result['semantic_score']:.3f}")
629
- st.metric("Keyword Match", f"{result['keyword_score']:.3f}")
 
 
 
 
630
 
631
- st.write("**🎯 Key Skills:**")
632
- for skill in result['skills'][:8]:
633
- st.write(f"β€’ {skill}")
 
 
634
 
635
  with col2:
636
- if result['explanation']:
637
- st.write("**πŸ€– AI Analysis:**")
638
- st.info(result['explanation'])
 
 
639
 
640
- st.write("**πŸ“„ Resume Preview:**")
641
- st.text_area("", result['text_preview'], height=150, disabled=True, key=f"preview_{result['rank']}")
642
-
643
- # Score visualization
644
- if len(st.session_state.results) > 1:
645
- st.subheader("πŸ“Š Score Visualization")
646
-
647
- chart_data = pd.DataFrame({
648
- 'Candidate': [r['name'] for r in st.session_state.results],
649
- 'Overall Score': [r['score'] for r in st.session_state.results],
650
- 'Semantic Score': [r['semantic_score'] for r in st.session_state.results],
651
- 'Keyword Score': [r['keyword_score'] for r in st.session_state.results]
652
- })
653
-
654
- st.bar_chart(chart_data.set_index('Candidate'))
655
 
656
  # Memory cleanup
657
- if st.button("🧹 Clear Memory"):
658
- if torch.cuda.is_available():
659
- torch.cuda.empty_cache()
660
- gc.collect()
661
- st.success("βœ… Memory cleared!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
 
663
  # Footer
664
  st.markdown("---")
665
  st.markdown(
666
  """
667
  <div style='text-align: center; color: #666;'>
668
- πŸš€ Powered by NVIDIA NV-Embed-v2 & Qwen3-14B | Built with Streamlit
669
  </div>
670
  """,
671
  unsafe_allow_html=True
 
3
  import numpy as np
4
  import torch
5
  import nltk
 
6
  import os
7
  import tempfile
8
  import base64
 
 
9
  from rank_bm25 import BM25Okapi
10
+ from sentence_transformers import SentenceTransformer, CrossEncoder
11
+ from nltk.tokenize import word_tokenize
 
12
  import pdfplumber
13
  import PyPDF2
14
  from docx import Document
15
  import csv
16
  from datasets import load_dataset
17
  import gc
18
+ from huggingface_hub import InferenceClient
19
+ import time
20
+ import faiss
21
+ import re
22
 
23
  # Download NLTK resources
24
  try:
 
47
  # Advanced options
48
  st.subheader("Advanced Options")
49
  top_k = st.number_input("Number of results to display", min_value=1, max_value=50, value=10, step=1)
50
+
51
+ # LLM Settings
52
+ st.subheader("LLM Settings")
53
+ use_llm_explanations = st.checkbox("Generate AI Explanations", value=True)
54
+ if use_llm_explanations:
55
+ hf_token = st.text_input("Hugging Face Token (optional)", type="password",
56
+ help="Enter your HF token for better rate limits")
57
 
58
  st.markdown("---")
59
+ st.markdown("### πŸ€– Advanced Pipeline")
60
+ st.markdown("- **Stage 1**: FAISS Recall (Top 50)")
61
+ st.markdown("- **Stage 2**: Cross-Encoder Re-ranking (Top 20)")
62
+ st.markdown("- **Stage 3**: BM25 Keyword Matching")
63
+ st.markdown("- **Stage 4**: LLM Intent Analysis")
64
+ st.markdown("- **Final**: Combined Scoring (Top 5)")
65
+ st.markdown("### πŸ“Š Models Used")
66
+ st.markdown("- **Embedding**: BAAI/bge-large-en-v1.5")
67
+ st.markdown("- **Cross-Encoder**: ms-marco-MiniLM-L6-v2")
68
+ st.markdown("- **LLM**: Qwen/Qwen3-14B")
69
+ st.markdown("### πŸ“ˆ Scoring Formula")
70
+ st.markdown("**Final Score = Cross-Encoder (0-1) + BM25 (0.1-0.2) + Intent (0-0.3)**")
71
 
72
# Seed each session-state slot the app uses; slots that already hold a value
# (from an earlier rerun) are left untouched.
_session_defaults = {
    'embedding_model': None,
    'cross_encoder': None,
    'results': [],
    'resume_texts': [],
    'file_names': [],
    'llm_client': None,
    'explanations_generated': False,
    'current_job_description': "",
}
for _slot, _initial in _session_defaults.items():
    if _slot not in st.session_state:
        st.session_state[_slot] = _initial
89
 
90
@st.cache_resource
def load_embedding_model():
    """Build the BAAI/bge-large-en-v1.5 sentence embedder.

    The function is wrapped in ``st.cache_resource``. A spinner is shown
    while the model is constructed and a success banner afterwards.

    Returns:
        The ``SentenceTransformer`` instance, or ``None`` when construction
        fails (the failure is reported through ``st.error``).
    """
    try:
        with st.spinner("πŸ”„ Loading BAAI/bge-large-en-v1.5 model..."):
            embedder = SentenceTransformer('BAAI/bge-large-en-v1.5')
        st.success("βœ… Embedding model loaded successfully!")
        return embedder
    except Exception as e:
        # Report the problem in the UI and signal failure with None.
        st.error(f"❌ Error loading embedding model: {str(e)}")
        return None
101
 
102
@st.cache_resource
def load_cross_encoder():
    """Build the ms-marco-MiniLM-L6-v2 Cross-Encoder used for re-ranking.

    The function is wrapped in ``st.cache_resource``. ``CrossEncoder`` comes
    from the module-level ``sentence_transformers`` import, so no local
    import is needed here.

    Returns:
        The ``CrossEncoder`` instance, or ``None`` when construction fails
        (the failure is reported through ``st.error``).
    """
    try:
        with st.spinner("πŸ”„ Loading Cross-Encoder ms-marco-MiniLM-L6-v2..."):
            model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')
        st.success("βœ… Cross-Encoder model loaded successfully!")
        return model
    except Exception as e:
        st.error(f"❌ Error loading Cross-Encoder model: {str(e)}")
        return None
114
+
115
def initialize_llm_client(hf_token=None):
    """Create an ``InferenceClient`` bound to the Qwen/Qwen3-14B model.

    Args:
        hf_token: Optional Hugging Face token. Any falsy value (``None`` or
            an empty string) is passed to the client as ``None``.

    Returns:
        The client, or ``None`` when construction raises (the error is
        reported through ``st.error``).
    """
    # Collapse "" and other falsy values to None before handing it over.
    token = hf_token or None
    try:
        return InferenceClient(model="Qwen/Qwen3-14B", token=token)
    except Exception as e:
        st.error(f"❌ Error initializing LLM client: {str(e)}")
        return None
126
 
127
  class ResumeScreener:
128
  def __init__(self):
129
  # Load models
130
+ self.embedding_model = load_embedding_model()
131
+ self.cross_encoder = load_cross_encoder()
132
+ self.llm_client = None
133
+
134
+ def set_llm_client(self, client):
135
+ """Set the LLM client"""
136
+ self.llm_client = client
137
 
138
  def extract_text_from_file(self, file_path, file_type):
139
  """Extract text from various file types"""
 
172
  return ""
173
 
174
  def get_embedding(self, text):
175
+ """Generate embedding for text using BGE model"""
176
  if self.embedding_model is None:
177
+ st.error("No embedding model loaded!")
178
+ return np.zeros(1024) # BGE-large dimension
179
 
180
  try:
181
+ # BGE models recommend adding instruction for retrieval
182
+ # For queries (job description)
183
+ if len(text) < 500: # Assuming shorter texts are queries
184
+ text = "Represent this sentence for searching relevant passages: " + text
 
 
 
 
 
 
 
 
 
 
185
 
186
+ # Truncate text to avoid memory issues
187
+ text = text[:8192] if text else ""
 
 
 
 
 
 
 
 
 
 
188
 
189
+ # Generate embedding
190
+ embedding = self.embedding_model.encode(text,
191
+ convert_to_numpy=True,
192
+ normalize_embeddings=True)
193
+ return embedding
194
+
195
  except Exception as e:
196
  st.error(f"Error generating embedding: {str(e)}")
197
+ return np.zeros(1024) # BGE-large dimension
198
 
199
  def calculate_bm25_scores(self, resume_texts, job_description):
200
  """Calculate BM25 scores for keyword matching"""
201
  try:
202
  job_tokens = word_tokenize(job_description.lower())
203
+ corpus = [word_tokenize(text.lower()) for text in resume_texts if text and text.strip()]
204
 
205
  if not corpus:
206
  return [0.0] * len(resume_texts)
 
213
  st.error(f"Error calculating BM25 scores: {str(e)}")
214
  return [0.0] * len(resume_texts)
215
 
216
+ def advanced_pipeline_ranking(self, resume_texts, job_description):
217
+ """Advanced pipeline: FAISS recall -> Cross-encoder -> BM25 -> LLM intent -> Final ranking"""
218
+ if not resume_texts:
219
+ return []
220
+
221
+ # Stage 1: FAISS Recall (Top 50)
222
+ st.write("πŸ” **Stage 1**: FAISS Recall - Finding top 50 candidates...")
223
+ top_50_indices = self.faiss_recall(resume_texts, job_description, top_k=50)
224
 
225
+ # Stage 2: Cross-Encoder Re-ranking (Top 20)
226
+ st.write("🎯 **Stage 2**: Cross-Encoder Re-ranking - Selecting top 20...")
227
+ top_20_results = self.cross_encoder_rerank(resume_texts, job_description, top_50_indices, top_k=20)
 
 
 
 
228
 
229
+ # Stage 3: BM25 Keyword Matching
230
+ st.write("πŸ”€ **Stage 3**: BM25 Keyword Matching...")
231
+ top_20_with_bm25 = self.add_bm25_scores(resume_texts, job_description, top_20_results)
 
 
 
 
232
 
233
+ # Stage 4: LLM Intent Analysis
234
+ st.write("πŸ€– **Stage 4**: LLM Intent Analysis...")
235
+ top_20_with_intent = self.add_intent_scores(resume_texts, job_description, top_20_with_bm25)
236
 
237
+ # Stage 5: Final Combined Ranking (Top 5)
238
+ st.write("πŸ† **Stage 5**: Final Combined Ranking...")
239
+ final_results = self.calculate_final_scores(top_20_with_intent)
 
240
 
241
+ return final_results[:5] # Return top 5
242
+
243
+ def faiss_recall(self, resume_texts, job_description, top_k=50):
244
+ """Stage 1: Use FAISS for initial recall to find top 50 resumes"""
245
+ try:
246
+ # Get job embedding
247
+ job_embedding = self.get_embedding(job_description)
248
+
249
+ # Get resume embeddings
250
+ resume_embeddings = []
251
+ progress_bar = st.progress(0)
252
+
253
+ for i, text in enumerate(resume_texts):
254
+ if text:
255
+ embedding = self.embedding_model.encode(text[:8192],
256
+ convert_to_numpy=True,
257
+ normalize_embeddings=True)
258
+ resume_embeddings.append(embedding)
259
+ else:
260
+ resume_embeddings.append(np.zeros(1024))
261
+ progress_bar.progress((i + 1) / len(resume_texts))
262
+
263
+ progress_bar.empty()
264
+
265
+ # Create FAISS index
266
+ resume_embeddings = np.array(resume_embeddings).astype('float32')
267
+ dimension = resume_embeddings.shape[1]
268
+ index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity
269
+ index.add(resume_embeddings)
270
+
271
+ # Search for top K
272
+ job_embedding = job_embedding.reshape(1, -1).astype('float32')
273
+ scores, indices = index.search(job_embedding, min(top_k, len(resume_texts)))
274
+
275
+ return indices[0].tolist()
276
+
277
+ except Exception as e:
278
+ st.error(f"Error in FAISS recall: {str(e)}")
279
+ # Fallback: return all indices
280
+ return list(range(min(top_k, len(resume_texts))))
281
+
282
+ def cross_encoder_rerank(self, resume_texts, job_description, top_50_indices, top_k=20):
283
+ """Stage 2: Use Cross-Encoder to re-rank top 50 and select top 20"""
284
+ try:
285
+ if not self.cross_encoder:
286
+ st.error("Cross-encoder not loaded!")
287
+ return [(idx, 0.0) for idx in top_50_indices[:top_k]]
288
+
289
+ # Prepare pairs for cross-encoder
290
+ pairs = []
291
+ valid_indices = []
292
+
293
+ for idx in top_50_indices:
294
+ if idx < len(resume_texts) and resume_texts[idx]:
295
+ # Truncate texts for cross-encoder
296
+ job_snippet = job_description[:512]
297
+ resume_snippet = resume_texts[idx][:512]
298
+ pairs.append([job_snippet, resume_snippet])
299
+ valid_indices.append(idx)
300
+
301
+ if not pairs:
302
+ return [(idx, 0.0) for idx in top_50_indices[:top_k]]
303
+
304
+ # Get cross-encoder scores
305
+ progress_bar = st.progress(0)
306
+ scores = []
307
+
308
+ # Process in batches to avoid memory issues
309
+ batch_size = 8
310
+ for i in range(0, len(pairs), batch_size):
311
+ batch = pairs[i:i+batch_size]
312
+ batch_scores = self.cross_encoder.predict(batch)
313
+ scores.extend(batch_scores)
314
+ progress_bar.progress(min(1.0, (i + batch_size) / len(pairs)))
315
+
316
+ progress_bar.empty()
317
+
318
+ # Combine indices with scores and sort
319
+ indexed_scores = list(zip(valid_indices, scores))
320
+ indexed_scores.sort(key=lambda x: x[1], reverse=True)
321
+
322
+ return indexed_scores[:top_k]
323
+
324
+ except Exception as e:
325
+ st.error(f"Error in cross-encoder re-ranking: {str(e)}")
326
+ return [(idx, 0.0) for idx in top_50_indices[:top_k]]
327
+
328
def add_bm25_scores(self, resume_texts, job_description, top_20_results):
    """Stage 3: attach a BM25 keyword score to each re-ranked candidate.

    Raw BM25 scores (from ``self.calculate_bm25_scores``) are min-max scaled
    into the 0.1-0.2 band. When the scores carry no signal (all zero, all
    equal, or missing) every candidate receives the neutral value 0.15.

    Args:
        resume_texts: All resume texts; result indices point into this list.
        job_description: Job description used as the BM25 query.
        top_20_results: Sequence of ``(index, cross_encoder_score)`` tuples.

    Returns:
        A list of ``(index, cross_encoder_score, bm25_score)`` tuples in the
        same order as ``top_20_results``.
    """
    default_score = 0.15  # midpoint of the 0.1-0.2 band

    if not top_20_results:
        # Nothing to score; avoid calling BM25 on an empty corpus.
        return []

    try:
        top_20_texts = [resume_texts[idx] for idx, _ in top_20_results]
        raw_scores = self.calculate_bm25_scores(top_20_texts, job_description)

        # Materialise as plain floats: truth-testing a NumPy array raises
        # "truth value of an array is ambiguous", which used to push every
        # array-valued result onto the error path.
        bm25_scores = [] if raw_scores is None else [float(s) for s in raw_scores]

        normalized_bm25 = [default_score] * len(bm25_scores)
        if bm25_scores:
            max_bm25 = max(bm25_scores)
            min_bm25 = min(bm25_scores)
            if max_bm25 > 0 and max_bm25 > min_bm25:
                spread = max_bm25 - min_bm25
                normalized_bm25 = [
                    0.1 + 0.1 * (score - min_bm25) / spread
                    for score in bm25_scores
                ]

        # Candidates without a matching BM25 entry keep the neutral default.
        return [
            (idx, cross_score, normalized_bm25[i] if i < len(normalized_bm25) else default_score)
            for i, (idx, cross_score) in enumerate(top_20_results)
        ]

    except Exception as e:
        st.error(f"Error adding BM25 scores: {str(e)}")
        return [(idx, cross_score, default_score) for idx, cross_score in top_20_results]
362
+
363
def add_intent_scores(self, resume_texts, job_description, top_20_with_bm25):
    """Stage 4: extend each candidate tuple with an LLM intent score.

    Args:
        resume_texts: All resume texts; candidate indices point into this list.
        job_description: Job description passed to ``self.analyze_intent``.
        top_20_with_bm25: Sequence of ``(index, cross_score, bm25_score)``.

    Returns:
        A list of ``(index, cross_score, bm25_score, intent_score)`` tuples.
        Without an LLM client, or when anything raises, every candidate gets
        the default intent score of 0.1.
    """
    try:
        if self.llm_client:
            total = len(top_20_with_bm25)
            scored = []
            bar = st.progress(0)

            for position, (idx, cross_score, bm25_score) in enumerate(top_20_with_bm25, start=1):
                intent = self.analyze_intent(resume_texts[idx], job_description)
                scored.append((idx, cross_score, bm25_score, intent))
                bar.progress(position / total)

            bar.empty()
            return scored

        # No client configured: report it and hand back the neutral score.
        st.warning("LLM client not available. Using default intent scores.")
        return [
            (idx, cross_score, bm25_score, 0.1)
            for idx, cross_score, bm25_score in top_20_with_bm25
        ]

    except Exception as e:
        st.error(f"Error adding intent scores: {str(e)}")
        return [
            (idx, cross_score, bm25_score, 0.1)
            for idx, cross_score, bm25_score in top_20_with_bm25
        ]
384
+
385
def analyze_intent(self, resume_text, job_description):
    """Ask the LLM whether the candidate is likely seeking this job.

    The resume is cut to 1500 characters and the job description to 800
    before being embedded in the prompt.

    Args:
        resume_text: Full resume text.
        job_description: Full job description text.

    Returns:
        0.3 when the reply says "Intent: Yes", 0.1 for "Intent: Maybe", and
        0.0 otherwise. If the request raises, a warning is shown and 0.1
        (the "Maybe" value) is returned.
    """
    try:
        # Slicing already handles texts shorter than the limit.
        resume_snippet = resume_text[:1500]
        job_snippet = job_description[:800]

        # Build the prompt from separate pieces so the triple-quote fences
        # around the snippets are literal text. Written inside a single
        # f\"\"\"...\"\"\" literal, the first fence closes the string and the
        # job description and resume never reach the model.
        prompt = (
            "You are given a job description and a candidate's resume.\n"
            'Clearly answer: "Is the candidate likely seeking this job? '
            "Respond with 'Yes', 'Maybe', or 'No' and give a brief justification.\"\n"
            "\n"
            "Job Description:\n"
            '"""\n'
            f"{job_snippet}\n"
            '"""\n'
            "\n"
            "Candidate Resume:\n"
            '"""\n'
            f"{resume_snippet}\n"
            '"""\n'
            "\n"
            "Response format:\n"
            "Intent: [Yes/Maybe/No]\n"
            "Reason: [Brief justification]"
        )

        response = self.llm_client.text_generation(
            prompt,
            max_new_tokens=100,
            temperature=0.3,
            top_p=0.9,
            do_sample=True
        )

        # Drop spaces so "Intent: Yes" and "Intent:Yes" parse the same way.
        compact = response.lower().replace(" ", "")
        if "intent:yes" in compact:
            return 0.3
        if "intent:maybe" in compact:
            return 0.1
        return 0.0

    except Exception as e:
        st.warning(f"Error analyzing intent: {str(e)}")
        return 0.1  # Default to "Maybe"
429
+
430
def calculate_final_scores(self, results_with_all_scores):
    """Stage 5: merge the per-stage scores into one ranked list.

    Args:
        results_with_all_scores: Iterable of
            ``(index, cross_score, bm25_score, intent_score)`` tuples.

    Returns:
        A list of dicts with keys ``index``, ``cross_encoder_score``,
        ``bm25_score``, ``intent_score`` and ``final_score``, sorted by
        ``final_score`` in descending order. An empty list is returned (and
        an error shown) if any entry cannot be processed.
    """
    def _combine(entry):
        idx, cross_score, bm25_score, intent_score = entry
        # Clamp the cross-encoder output into the 0-1 range.
        clamped = max(0, min(1, cross_score))
        return {
            'index': idx,
            'cross_encoder_score': clamped,
            'bm25_score': bm25_score,
            'intent_score': intent_score,
            # Final Score = Cross-Encoder (0-1) + BM25 (0.1-0.2) + Intent (0-0.3)
            'final_score': clamped + bm25_score + intent_score,
        }

    try:
        combined = [_combine(entry) for entry in results_with_all_scores]
        return sorted(combined, key=lambda row: row['final_score'], reverse=True)

    except Exception as e:
        st.error(f"Error calculating final scores: {str(e)}")
        return []
458
 
459
def extract_skills(self, text, job_description):
    """Extract skills from a resume that are also named in the job description.

    Two passes are made: first a fixed list of common technical skills, then
    any other job-description token longer than three characters. A term
    counts only when it appears as a whole term (not inside a longer word)
    in both texts.

    Args:
        text: Resume text.
        job_description: Job description text.

    Returns:
        Up to 15 unique lowercase terms: common skills first (in list order),
        then job-description words in order of first appearance. An empty
        list when either input is empty.
    """
    # Local import keeps the block self-contained regardless of the
    # module-level import list.
    import re

    if not text or not job_description:
        return []

    # Common tech skills
    common_skills = [
        "python", "java", "javascript", "react", "angular", "vue", "node.js",
        "express", "django", "flask", "spring", "sql", "nosql", "html", "css",
        "aws", "azure", "gcp", "docker", "kubernetes", "jenkins", "git", "github",
        "agile", "scrum", "jira", "ci/cd", "devops", "microservices", "rest", "api",
        "machine learning", "deep learning", "data science", "artificial intelligence",
        "tensorflow", "pytorch", "keras", "scikit-learn", "pandas", "numpy",
        "matplotlib", "seaborn", "jupyter", "r", "sas", "spss", "tableau", "powerbi",
        "excel", "mysql", "postgresql", "mongodb", "redis", "elasticsearch",
        "kafka", "rabbitmq", "spark", "hadoop", "hive", "airflow", "linux", "unix"
    ]

    # Basic filter to avoid common words
    stop_words = {'with', 'have', 'that', 'this', 'from', 'what', 'when', 'where'}

    text_lower = text.lower()
    job_lower = job_description.lower()

    def _mentions(haystack, term):
        # Whole-term match: plain substring tests report "java" inside
        # "javascript", "rest" inside "interest" and "r" almost everywhere.
        pattern = r"(?<!\w)" + re.escape(term) + r"(?!\w)"
        return re.search(pattern, haystack) is not None

    # A dict keeps insertion order, so the result is deterministic; slicing a
    # set would return an arbitrary, run-dependent selection.
    found = {}

    # Check common skills that appear in both resume and job description.
    # Matching against the full job text lets multi-word skills be found.
    for skill in common_skills:
        if _mentions(text_lower, skill) and _mentions(job_lower, skill):
            found[skill] = None

    # Check for other words mentioned in the job description.
    for token in word_tokenize(job_description):
        word = token.lower()
        if len(word) <= 3 or word in found or word in stop_words:
            continue
        if not any(ch.isalnum() for ch in word):
            continue  # pure punctuation tokens are never skills
        if _mentions(text_lower, word):
            found[word] = None

    return list(found)[:15]  # Return top 15 unique skills
497
 
498
def generate_simple_explanation(self, score, semantic_score, bm25_score, skills):
    """Build a short template-based match explanation (non-LLM fallback).

    Args:
        score: Overall score; selects the quality label
            (> 0.8 excellent, > 0.6 strong, > 0.4 moderate, otherwise limited).
        semantic_score: Semantic relevance score, quoted when it exceeds
            ``bm25_score``.
        bm25_score: Keyword score, quoted otherwise.
        skills: Matching skills; at most the first five are listed.

    Returns:
        The explanation as a single string.
    """
    quality = "limited"
    for threshold, label in ((0.8, "excellent"), (0.6, "strong"), (0.4, "moderate")):
        if score > threshold:
            quality = label
            break

    parts = [f"This candidate shows {quality} alignment with the position (score: {score:.2f}). "]

    if semantic_score > bm25_score:
        parts.append(
            f"The resume demonstrates strong conceptual relevance ({semantic_score:.2f}) suggesting good experience fit. "
        )
    else:
        parts.append(
            f"The resume has high keyword match ({bm25_score:.2f}) indicating direct skill alignment. "
        )

    if skills:
        parts.append(f"Key matching competencies include: {', '.join(skills[:5])}.")

    return "".join(parts)
520
+
521
def generate_llm_explanation(self, resume_text, job_description, score, skills, max_retries=3):
    """Generate a roughly 150-word match explanation with the LLM client.

    The resume is cut to 2000 characters and the job description to 1000
    before being placed in the prompt. A reply of 130-170 words is returned
    as is; a longer reply is cut to 150 words; a shorter reply triggers
    another attempt.

    Args:
        resume_text: Full resume text.
        job_description: Full job description text.
        score: Compatibility score, rendered as a percentage in the prompt.
        skills: Matching skills; the first ten are listed in the prompt.
        max_retries: Maximum number of LLM calls.

    Returns:
        The LLM explanation, or the output of
        ``self.generate_simple_explanation`` when no client is configured or
        no attempt produced a usable reply.
    """
    # Local import so the retry back-off does not depend on a module-level
    # import.
    import time

    skills = skills or []

    if not self.llm_client:
        return self.generate_simple_explanation(score, score, score, skills)

    # Truncate texts to manage token limits (slicing handles short texts).
    resume_snippet = resume_text[:2000]
    job_snippet = job_description[:1000]

    prompt = (
        "You are an expert HR analyst. Analyze this individual candidate's resume "
        "against the job requirements and write EXACTLY 150 words explaining why "
        "this specific candidate is suitable for the position.\n"
        "\n"
        "Structure your 150-word analysis as follows:\n"
        "1. Experience alignment (40-50 words)\n"
        "2. Key strengths and skills match (40-50 words)\n"
        "3. Unique value proposition (40-50 words)\n"
        "4. Overall recommendation (10-20 words)\n"
        "\n"
        "Job Requirements:\n"
        f"{job_snippet}\n"
        "\n"
        "Candidate's Resume:\n"
        f"{resume_snippet}\n"
        "\n"
        f"Identified Matching Skills: {', '.join(skills[:10])}\n"
        f"Compatibility Score: {score:.1%}\n"
        "\n"
        "Write a professional, detailed 150-word analysis for THIS INDIVIDUAL CANDIDATE:"
    )

    for attempt in range(max_retries):
        try:
            response = self.llm_client.text_generation(
                prompt,
                max_new_tokens=200,
                temperature=0.7,
                top_p=0.9,
                do_sample=True
            )
            explanation = response.strip()
        except Exception:
            # Back off before the next attempt; after the last one fall
            # through to the simple explanation below.
            if attempt < max_retries - 1:
                time.sleep(2)
            continue

        words = explanation.split()

        if len(words) > 170:
            # Too long: keep about 150 words and close the sentence cleanly,
            # without producing endings such as ",." or "!.".
            truncated = ' '.join(words[:150]).rstrip(',;:')
            if not truncated.endswith(('.', '!', '?')):
                truncated += '.'
            return truncated

        if len(words) >= 130:
            return explanation

        # Too short: sampling is enabled, so asking again can give a longer reply.

    # If all retries failed, use simple explanation
    return self.generate_simple_explanation(score, score, score, skills)
592
 
593
  def create_download_link(df, filename="resume_screening_results.csv"):
594
  """Create download link for results"""
 
598
 
599
  # Main App Interface
600
  st.title("🎯 AI-Powered Resume Screener")
601
+ st.markdown("*Find the perfect candidates using BAAI/bge-large-en-v1.5 embeddings and Qwen3-14B explanations*")
602
  st.markdown("---")
603
 
604
  # Initialize screener
605
+ screener = ResumeScreener()
606
+
607
+ # Initialize LLM client if enabled
608
+ if use_llm_explanations:
609
+ if 'hf_token' in locals() and hf_token:
610
+ if st.session_state.llm_client is None:
611
+ st.session_state.llm_client = initialize_llm_client(hf_token)
612
+ else:
613
+ if st.session_state.llm_client is None:
614
+ st.session_state.llm_client = initialize_llm_client()
615
+
616
+ screener.set_llm_client(st.session_state.llm_client)
617
 
618
  # Job Description Input
619
  st.header("πŸ“ Step 1: Enter Job Description")
 
626
  # Resume Input Options
627
  st.header("πŸ“„ Step 2: Upload Resumes")
628
 
629
+ # Show loaded resumes indicator
630
+ if st.session_state.resume_texts:
631
+ col1, col2 = st.columns([3, 1])
632
+ with col1:
633
+ st.info(f"πŸ“š {len(st.session_state.resume_texts)} resumes loaded and ready for analysis")
634
+ with col2:
635
+ if st.button("πŸ—‘οΈ Clear Resumes", type="secondary", help="Clear all loaded resumes to start fresh"):
636
+ st.session_state.resume_texts = []
637
+ st.session_state.file_names = []
638
+ st.session_state.results = []
639
+ st.session_state.explanations_generated = False
640
+ st.session_state.current_job_description = ""
641
+ st.rerun()
642
+
643
  input_method = st.radio(
644
  "Choose input method:",
645
  ["πŸ“ Upload Files", "πŸ—‚οΈ Load from CSV Dataset", "πŸ”— Load from Hugging Face Dataset"]
646
  )
647
 
 
 
 
648
  if input_method == "πŸ“ Upload Files":
649
  uploaded_files = st.file_uploader(
650
  "Upload resume files",
 
655
 
656
  if uploaded_files:
657
  with st.spinner(f"πŸ”„ Processing {len(uploaded_files)} files..."):
658
+ resume_texts = []
659
+ file_names = []
660
+
661
  for file in uploaded_files:
662
  file_type = file.name.split('.')[-1].lower()
663
 
 
664
  with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
665
  tmp_file.write(file.getvalue())
666
  tmp_path = tmp_file.name
667
 
 
668
  text = screener.extract_text_from_file(tmp_path, file_type)
669
  if text.strip():
670
  resume_texts.append(text)
671
  file_names.append(file.name)
672
 
 
673
  os.unlink(tmp_path)
674
+
675
+ st.session_state.resume_texts = resume_texts
676
+ st.session_state.file_names = file_names
677
+
678
  if resume_texts:
679
  st.success(f"βœ… Successfully processed {len(resume_texts)} resumes")
680
 
 
699
 
700
  if st.button("πŸš€ Process CSV Data"):
701
  with st.spinner("πŸ”„ Processing CSV data..."):
702
+ resume_texts = []
703
+ file_names = []
704
+
705
  for idx, row in df.iterrows():
706
  text = str(row[text_column])
707
  if text and text.strip() and text.lower() != 'nan':
 
711
  file_names.append(f"Resume_{idx}")
712
  else:
713
  file_names.append(str(row[name_column]))
714
+
715
+ st.session_state.resume_texts = resume_texts
716
+ st.session_state.file_names = file_names
717
 
718
  if resume_texts:
719
  st.success(f"βœ… Successfully loaded {len(resume_texts)} resumes from CSV")
 
722
  st.error(f"❌ Error processing CSV: {str(e)}")
723
 
724
  elif input_method == "πŸ”— Load from Hugging Face Dataset":
725
+ st.markdown("**Popular Resume Datasets:**")
726
+ st.markdown("- `ahmedheakl/resume-atlas`")
727
+ st.markdown("- `InferenceFly/Resume-Dataset`")
728
 
729
  col1, col2 = st.columns([2, 1])
730
  with col1:
 
744
  st.success(f"βœ… Loaded dataset with {len(dataset)} entries")
745
  st.write("**Dataset Preview:**")
746
 
 
747
  preview_df = pd.DataFrame(dataset[:5])
748
  st.dataframe(preview_df)
749
 
 
750
  text_column = st.selectbox(
751
  "Select column with resume text:",
752
  dataset.column_names,
753
+ index=dataset.column_names.index('resume_text') if 'resume_text' in dataset.column_names else 0
754
  )
755
 
756
  category_column = None
757
  if 'category' in dataset.column_names:
758
+ categories = list(set(dataset['category']))
759
  category_column = st.selectbox(
760
  "Filter by category (optional):",
761
+ ["All"] + categories
762
  )
763
 
764
  max_samples = st.slider("Maximum samples to load:", 10, min(1000, len(dataset)), 100)
765
 
766
  if st.button("πŸš€ Process Dataset"):
767
  with st.spinner("πŸ”„ Processing dataset..."):
768
+ resume_texts = []
769
+ file_names = []
770
+
771
  filtered_dataset = dataset
772
 
 
773
  if category_column and category_column != "All":
774
  filtered_dataset = dataset.filter(lambda x: x['category'] == category_column)
775
 
 
776
  sample_indices = list(range(min(max_samples, len(filtered_dataset))))
777
 
778
  for idx in sample_indices:
 
782
  if text and text.strip() and text.lower() != 'nan':
783
  resume_texts.append(text)
784
 
 
785
  if 'id' in item:
786
  file_names.append(f"Resume_{item['id']}")
787
  else:
788
  file_names.append(f"Resume_{idx}")
789
+
790
+ st.session_state.resume_texts = resume_texts
791
+ st.session_state.file_names = file_names
792
 
793
  if resume_texts:
794
  st.success(f"βœ… Successfully loaded {len(resume_texts)} resumes")
 
797
  st.error(f"❌ Error loading dataset: {str(e)}")
798
 
799
  # Processing and Results
800
+ st.header("πŸ” Step 3: Analyze Resumes")
801
+
802
+ # First button: Find top K candidates (fast ranking)
803
+ col1, col2 = st.columns([1, 1])
804
+
805
+ with col1:
806
+ if st.button("πŸš€ Advanced Pipeline Analysis",
807
+ disabled=not (job_description and st.session_state.resume_texts),
808
+ type="primary",
809
+ help="Run the complete 5-stage advanced pipeline"):
810
+ if len(st.session_state.resume_texts) == 0:
811
+ st.error("❌ Please upload resumes first!")
812
+ elif not job_description.strip():
813
+ st.error("❌ Please enter a job description!")
814
+ else:
815
+ with st.spinner("πŸš€ Running Advanced Pipeline Analysis..."):
816
+ try:
817
+ # Run the advanced pipeline
818
+ pipeline_results = screener.advanced_pipeline_ranking(
819
+ st.session_state.resume_texts, job_description
 
 
 
 
 
820
  )
821
+
822
+ # Prepare results for display
823
+ results = []
824
+
825
+ for rank, result_data in enumerate(pipeline_results, 1):
826
+ idx = result_data['index']
827
+ name = st.session_state.file_names[idx]
828
+ text = st.session_state.resume_texts[idx]
829
+
830
+ # Extract skills
831
+ skills = screener.extract_skills(text, job_description)
832
+
833
+ results.append({
834
+ 'rank': rank,
835
+ 'name': name,
836
+ 'final_score': result_data['final_score'],
837
+ 'cross_encoder_score': result_data['cross_encoder_score'],
838
+ 'bm25_score': result_data['bm25_score'],
839
+ 'intent_score': result_data['intent_score'],
840
+ 'skills': skills,
841
+ 'text': text,
842
+ 'text_preview': text[:500] + "..." if len(text) > 500 else text,
843
+ 'explanation': None # No detailed explanation yet
844
+ })
845
+
846
+ # Add simple explanations for now
847
+ for result in results:
848
+ result['explanation'] = screener.generate_simple_explanation(
849
+ result['final_score'],
850
+ result['cross_encoder_score'],
851
+ result['bm25_score'],
852
+ result['skills']
853
+ )
854
+
855
+ # Store in session state
856
+ st.session_state.results = results
857
+ st.session_state.explanations_generated = False
858
+ st.session_state.current_job_description = job_description
859
+
860
+ st.success(f"πŸš€ Advanced pipeline complete! Found top {len(st.session_state.results)} candidates.")
861
+
862
+ except Exception as e:
863
+ st.error(f"❌ Error during analysis: {str(e)}")
864
+
865
+ # Second button: Generate AI explanations (slower, optional)
866
+ with col2:
867
+ # Show this button only if we have results and LLM is enabled
868
+ show_explanation_button = (
869
+ st.session_state.results and
870
+ use_llm_explanations and
871
+ st.session_state.llm_client and
872
+ not st.session_state.explanations_generated
873
+ )
874
+
875
+ if show_explanation_button:
876
+ if st.button("πŸ€– Generate AI Explanations",
877
+ type="secondary",
878
+ help="Generate detailed 150-word explanations using Qwen3-14B (takes longer)"):
879
+ with st.spinner("πŸ€– Generating detailed AI explanations..."):
880
+ try:
881
+ explanation_progress = st.progress(0)
882
+ explanation_text = st.empty()
883
+
884
+ for i, result in enumerate(st.session_state.results):
885
+ explanation_text.text(f"πŸ€– Generating AI explanation for candidate {i+1}/{len(st.session_state.results)}...")
886
+
887
+ llm_explanation = screener.generate_llm_explanation(
888
+ result['text'],
889
+ st.session_state.current_job_description,
890
+ result['final_score'],
891
+ result['skills']
892
+ )
893
+ result['explanation'] = llm_explanation
894
+
895
+ explanation_progress.progress((i + 1) / len(st.session_state.results))
896
+
897
+ explanation_progress.empty()
898
+ explanation_text.empty()
899
+
900
+ # Mark explanations as generated
901
+ st.session_state.explanations_generated = True
902
+
903
+ st.success(f"πŸ€– AI explanations generated for all {len(st.session_state.results)} candidates!")
904
+
905
+ except Exception as e:
906
+ st.error(f"❌ Error generating explanations: {str(e)}")
907
+
908
+ elif st.session_state.results and st.session_state.explanations_generated:
909
+ st.info("βœ… AI explanations already generated!")
910
+
911
+ elif st.session_state.results and not use_llm_explanations:
912
+ st.info("πŸ’‘ Enable 'Generate AI Explanations' in sidebar to use this feature")
913
+
914
+ elif st.session_state.results and not st.session_state.llm_client:
915
+ st.warning("⚠️ LLM client not available. Check your Hugging Face token.")
916
 
917
  # Display Results
918
  if st.session_state.results:
919
  st.header("πŸ† Top Candidates")
920
 
921
+ # Create tabs for different views
922
+ tab1, tab2, tab3 = st.tabs(["πŸ“Š Summary", "πŸ“‹ Detailed Analysis", "πŸ“ˆ Visualizations"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
923
 
924
+ with tab1:
925
+ # Create summary dataframe with new scoring system
926
+ summary_data = []
927
+ for result in st.session_state.results:
928
+ # Map intent score to text
929
+ intent_text = "Yes" if result['intent_score'] == 0.3 else "Maybe" if result['intent_score'] == 0.1 else "No"
930
+
931
+ summary_data.append({
932
+ "Rank": result['rank'],
933
+ "Candidate": result['name'],
934
+ "Final Score": f"{result['final_score']:.2f}",
935
+ "Cross-Encoder": f"{result['cross_encoder_score']:.2f}",
936
+ "BM25": f"{result['bm25_score']:.2f}",
937
+ "Intent": f"{intent_text} ({result['intent_score']:.1f})",
938
+ "Top Skills": ", ".join(result['skills'][:5])
939
+ })
940
+
941
+ summary_df = pd.DataFrame(summary_data)
942
+
943
+ # Style the dataframe
944
def color_scores(val):
    """Return a CSS background colour for a formatted score cell.

    The digits and dots in ``val`` are joined and parsed as a float. Values
    of 1.0 or more use the 1.2 / 1.0 cut-offs; smaller values use the
    0.7 / 0.5 cut-offs. Green marks the top band, amber the middle, red the
    rest. Non-string cells, strings without digits, and strings whose digits
    do not form a number get no styling.

    NOTE(review): the ``'Final Score' in val`` test inspects the cell text,
    not the column name, so for plain numeric cells the scale is chosen by
    magnitude alone. Confirm whether per-column thresholds were intended.

    Args:
        val: Cell value handed over by the pandas Styler.

    Returns:
        A ``background-color`` CSS declaration, or an empty string.
    """
    green = 'background-color: #d4edda'
    amber = 'background-color: #fff3cd'
    red = 'background-color: #f8d7da'

    if not isinstance(val, str) or not any(char.isdigit() for char in val):
        return ''

    try:
        # Extract numeric value
        numeric_val = float(''.join(c for c in val if c.isdigit() or c == '.'))
    except ValueError:
        # e.g. "1.2.3": digits are present but do not form one number.
        return ''

    if 'Final Score' in val or numeric_val >= 1.0:
        high, mid = 1.2, 1.0
    else:
        high, mid = 0.7, 0.5

    if numeric_val >= high:
        return green
    if numeric_val >= mid:
        return amber
    return red
966
+
967
+ styled_df = summary_df.style.applymap(color_scores, subset=['Final Score', 'Cross-Encoder', 'BM25'])
968
+ st.dataframe(styled_df, use_container_width=True)
969
+
970
+ # Download link
971
+ detailed_data = []
972
+ for result in st.session_state.results:
973
+ intent_text = "Yes" if result['intent_score'] == 0.3 else "Maybe" if result['intent_score'] == 0.1 else "No"
974
+
975
+ detailed_data.append({
976
+ "Rank": result['rank'],
977
+ "Candidate": result['name'],
978
+ "Final_Score": result['final_score'],
979
+ "Cross_Encoder_Score": result['cross_encoder_score'],
980
+ "BM25_Score": result['bm25_score'],
981
+ "Intent_Score": result['intent_score'],
982
+ "Intent_Analysis": intent_text,
983
+ "Skills": "; ".join(result['skills']),
984
+ "AI_Explanation": result['explanation'],
985
+ "Resume_Preview": result['text_preview']
986
+ })
987
+
988
+ download_df = pd.DataFrame(detailed_data)
989
+ st.markdown(create_download_link(download_df), unsafe_allow_html=True)
990
 
991
+ with tab2:
992
+ # Detailed results with new scoring breakdown
993
+ for result in st.session_state.results:
994
+ intent_text = "Yes" if result['intent_score'] == 0.3 else "Maybe" if result['intent_score'] == 0.1 else "No"
995
+
996
+ with st.expander(f"#{result['rank']}: {result['name']} (Final Score: {result['final_score']:.2f})"):
997
+ col1, col2 = st.columns([1, 2])
998
+
999
+ with col1:
1000
+ st.metric("πŸ† Final Score", f"{result['final_score']:.2f}")
1001
+
1002
+ st.write("**πŸ“Š Score Breakdown:**")
1003
+ st.metric("🎯 Cross-Encoder", f"{result['cross_encoder_score']:.2f}", help="Semantic relevance (0-1)")
1004
+ st.metric("πŸ”€ BM25 Keywords", f"{result['bm25_score']:.2f}", help="Keyword matching (0.1-0.2)")
1005
+ st.metric("πŸ€– Intent Analysis", f"{intent_text} ({result['intent_score']:.1f})", help="Job seeking likelihood (0-0.3)")
1006
+
1007
+ st.write("**🎯 Matching Skills:**")
1008
+ skills_per_column = 5
1009
+ skill_cols = st.columns(2)
1010
+ for idx, skill in enumerate(result['skills'][:10]):
1011
+ with skill_cols[idx % 2]:
1012
+ st.write(f"β€’ {skill}")
1013
+
1014
+ with col2:
1015
+ st.write("**πŸ’‘ AI-Generated Match Analysis:**")
1016
+ st.info(result['explanation'])
1017
+
1018
+ st.write("**πŸ“„ Resume Preview:**")
1019
+ st.text_area("", result['text_preview'], height=200, disabled=True, key=f"preview_{result['rank']}")
1020
 
1021
+ with tab3:
1022
+ # Score visualization
1023
+ if len(st.session_state.results) > 1:
1024
+ # Bar chart
1025
+ st.subheader("Score Comparison")
1026
+
1027
+ chart_data = pd.DataFrame({
1028
+ 'Candidate': [r['name'][:20] + '...' if len(r['name']) > 20 else r['name']
1029
+ for r in st.session_state.results],
1030
+ 'Final Score': [r['final_score'] for r in st.session_state.results],
1031
+ 'Cross-Encoder': [r['cross_encoder_score'] for r in st.session_state.results],
1032
+ 'BM25': [r['bm25_score'] for r in st.session_state.results],
1033
+ 'Intent': [r['intent_score'] for r in st.session_state.results]
1034
+ })
1035
+
1036
+ st.bar_chart(chart_data.set_index('Candidate'))
1037
+
1038
+ # Score distribution
1039
+ col1, col2 = st.columns(2)
1040
 
1041
  with col1:
1042
+ st.subheader("Score Distribution")
1043
+ score_ranges = {
1044
+ 'Excellent (β‰₯1.2)': sum(1 for r in st.session_state.results if r['final_score'] >= 1.2),
1045
+ 'Good (1.0-1.2)': sum(1 for r in st.session_state.results if 1.0 <= r['final_score'] < 1.2),
1046
+ 'Fair (0.8-1.0)': sum(1 for r in st.session_state.results if 0.8 <= r['final_score'] < 1.0),
1047
+ 'Poor (<0.8)': sum(1 for r in st.session_state.results if r['final_score'] < 0.8),
1048
+ }
1049
 
1050
+ dist_df = pd.DataFrame({
1051
+ 'Range': score_ranges.keys(),
1052
+ 'Count': score_ranges.values()
1053
+ })
1054
+ st.bar_chart(dist_df.set_index('Range'))
1055
 
1056
  with col2:
1057
+ st.subheader("Average Scores")
1058
+ avg_final = np.mean([r['final_score'] for r in st.session_state.results])
1059
+ avg_cross = np.mean([r['cross_encoder_score'] for r in st.session_state.results])
1060
+ avg_bm25 = np.mean([r['bm25_score'] for r in st.session_state.results])
1061
+ avg_intent = np.mean([r['intent_score'] for r in st.session_state.results])
1062
 
1063
+ st.metric("Average Final Score", f"{avg_final:.2f}")
1064
+ st.metric("Average Cross-Encoder", f"{avg_cross:.2f}")
1065
+ st.metric("Average BM25", f"{avg_bm25:.2f}")
1066
+ st.metric("Average Intent", f"{avg_intent:.2f}")
 
 
 
 
 
 
 
 
 
 
 
1067
 
1068
  # Memory cleanup
1069
+ st.markdown("---")
1070
+ st.subheader("🧹 Reset Application")
1071
+ col1, col2, col3 = st.columns([1, 1, 3])
1072
+ with col1:
1073
+ if st.button("πŸ—‘οΈ Clear Resumes Only", type="secondary", help="Clear only the loaded resumes"):
1074
+ st.session_state.resume_texts = []
1075
+ st.session_state.file_names = []
1076
+ st.session_state.results = []
1077
+ st.session_state.explanations_generated = False
1078
+ st.session_state.current_job_description = ""
1079
+ st.success("βœ… Resumes cleared!")
1080
+ st.rerun()
1081
+
1082
+ with col2:
1083
+ if st.button("🧹 Clear Everything", type="primary", help="Clear all data and free memory"):
1084
+ st.session_state.resume_texts = []
1085
+ st.session_state.file_names = []
1086
+ st.session_state.results = []
1087
+ st.session_state.explanations_generated = False
1088
+ st.session_state.current_job_description = ""
1089
+
1090
+ if torch.cuda.is_available():
1091
+ torch.cuda.empty_cache()
1092
+ gc.collect()
1093
+ st.success("βœ… Everything cleared!")
1094
+ st.rerun()
1095
 
1096
  # Footer
1097
  st.markdown("---")
1098
  st.markdown(
1099
  """
1100
  <div style='text-align: center; color: #666;'>
1101
+ πŸš€ Powered by BAAI/bge-large-en-v1.5 & Qwen3-14B | Built with Streamlit
1102
  </div>
1103
  """,
1104
  unsafe_allow_html=True