Commit 6ff5e82 · root committed · 1 parent: 2e8072e
Files changed (5)
  1. alt_models.py +0 -159
  2. app.py +497 -932
  3. explanation_generator.py +0 -223
  4. fix_dependencies.py +0 -76
  5. requirements.txt +8 -11
alt_models.py DELETED
@@ -1,159 +0,0 @@
-"""
-Alternative model loading implementation without sys.modules patching
-"""
-
-import torch
-from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-
-def count_gpus():
-    """Count the number of available GPUs"""
-    if torch.cuda.is_available():
-        return torch.cuda.device_count()
-    return 0
-
-def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
-    """Load the embedding model with a try-except approach instead of module patching"""
-    try:
-        print(f"Loading embedding model {model_name}...")
-
-        # Create a simple Replicate class that may be needed
-        class Replicate(torch.nn.Module):
-            def __init__(self, module, num_replicas=1):
-                super().__init__()
-                self.module = module
-                self.num_replicas = num_replicas
-
-            def forward(self, *args, **kwargs):
-                return self.module(*args, **kwargs)
-
-        # Get number of GPUs
-        num_gpus = count_gpus()
-        print(f"Found {num_gpus} GPUs")
-
-        # Choose device map strategy based on GPU count
-        if num_gpus > 1:
-            # For multi-GPU setup, use balanced distribution
-            device_map = "balanced"
-            print(f"Using balanced device mapping across {num_gpus} GPUs")
-        else:
-            # For single GPU, use auto or specific mapping based on memory
-            device_map = "auto"
-            print("Using automatic device mapping")
-
-        # Try the standard loading approach
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        model = AutoModel.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            device_map=device_map
-        )
-
-        print(f"Successfully loaded {model_name}")
-        return model, tokenizer
-    except Exception as e:
-        # If the first approach fails, try with module.__dict__
-        try:
-            print(f"First loading approach failed: {str(e)}")
-            print("Trying alternative loading approach...")
-
-            # Import the module
-            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-            # Dynamically get the module
-            model_class = AutoModel._MODEL_MAPPING[AutoModel._model_mapping[model_name]]
-
-            # Add Replicate to the module's namespace
-            model_class.__module_dict__ = {}
-            model_class.__module_dict__["Replicate"] = Replicate
-
-            # Get number of GPUs
-            num_gpus = count_gpus()
-
-            # Choose device map strategy based on GPU count
-            if num_gpus > 1:
-                device_map = "balanced"
-            else:
-                device_map = "auto"
-
-            # Try loading with the augmented namespace
-            model = model_class.from_pretrained(
-                model_name,
-                trust_remote_code=True,
-                device_map=device_map
-            )
-
-            print(f"Successfully loaded {model_name} with alternative approach")
-            return model, tokenizer
-        except Exception as e2:
-            print(f"Alternative loading approach also failed: {str(e2)}")
-            print(f"Could not load embedding model {model_name}")
-            return None, None
-
-def load_explanation_model(model_name="Qwen/QwQ-32B"):
-    """Load the explanation model with a try-except approach instead of module patching"""
-    try:
-        print(f"Loading explanation model {model_name}...")
-
-        # Get number of GPUs
-        num_gpus = count_gpus()
-        print(f"Found {num_gpus} GPUs")
-
-        # Choose quantization and device strategy based on GPU count and memory
-        if num_gpus > 1:
-            # For multi-GPU, use 4-bit quantization and balanced distribution
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True
-            )
-            device_map = "balanced"
-            print(f"Using 4-bit quantization with balanced device mapping across {num_gpus} GPUs")
-        else:
-            # For single GPU, use more aggressive 4-bit quantization
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True
-            )
-            device_map = "auto"
-            print("Using 4-bit quantization with automatic device mapping")
-
-        # Create a simple Replicate class that may be needed
-        class Replicate(torch.nn.Module):
-            def __init__(self, module, num_replicas=1):
-                super().__init__()
-                self.module = module
-                self.num_replicas = num_replicas
-
-            def forward(self, *args, **kwargs):
-                return self.module(*args, **kwargs)
-
-        # Try the standard loading approach
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-        # Check if we have enough resources to load the model
-        if torch.cuda.is_available():
-            total_gpu_memory = sum([torch.cuda.get_device_properties(i).total_memory for i in range(num_gpus)]) / (1024**3)
-            if num_gpus > 1 or total_gpu_memory >= 16:  # 16 GB (reduced thanks to quantization)
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    quantization_config=quantization_config,
-                    device_map=device_map,
-                    trust_remote_code=True,
-                    torch_dtype=torch.float16,
-                    max_memory={i: f"{int(torch.cuda.get_device_properties(i).total_memory / (1024**3) * 0.9)}GiB" for i in range(num_gpus)}
-                )
-                print(f"Successfully loaded {model_name}")
-                return model, tokenizer
-            else:
-                print("Not enough GPU memory, using template-based explanations")
-                return None, tokenizer
-        else:
-            print("CUDA not available, using template-based explanations")
-            return None, tokenizer
-    except Exception as e:
-        print(f"Error loading explanation model: {str(e)}")
-        print("Falling back to template-based explanations.")
-        return None, None
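For reference, the pattern that replaces these deleted loaders in app.py (visible in the diff below) caches model construction with Streamlit's `st.cache_resource`, so script reruns reuse the already-loaded weights instead of reloading them. A minimal, self-contained sketch of that caching pattern, assuming the same model name and `device_map` used in this commit:

```python
import streamlit as st
from transformers import AutoModel, AutoTokenizer

@st.cache_resource  # runs once per process; subsequent reruns reuse the cached objects
def load_embedding_model(model_name: str = "nvidia/NV-Embed-v2"):
    """Load tokenizer and model once and share them across Streamlit reruns."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
    return model, tokenizer

model, tokenizer = load_embedding_model()  # cheap after the first call
```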
app.py CHANGED
@@ -1,5 +1,4 @@
 import streamlit as st
-import pdfplumber
 import pandas as pd
 import numpy as np
 import torch
@@ -8,59 +7,18 @@ import faiss
 import os
 import tempfile
 import base64
 from rank_bm25 import BM25Okapi
-from transformers import AutoModel, AutoTokenizer
-from sentence_transformers import SentenceTransformer
 from nltk.tokenize import word_tokenize, sent_tokenize
 from tqdm import tqdm
-import re
-import io
 import PyPDF2
 from docx import Document
 import csv
-import sys
-
-# Use the alternative model loading approach
-try:
-    # Try to import the functions from alt_models.py
-    from alt_models import load_embedding_model, load_explanation_model
-    USE_ALT_MODELS = True
-except ImportError:
-    USE_ALT_MODELS = False
-    # If import fails, we'll use the original approach
-    # Add Replicate class workaround
-    class Replicate(torch.nn.Module):
-        """Workaround class for missing Replicate in NV-Embed and Qwen models"""
-        def __init__(self, module, num_replicas=1):
-            super().__init__()
-            self.module = module
-            self.num_replicas = num_replicas
-
-        def forward(self, *args, **kwargs):
-            return self.module(*args, **kwargs)
-
-    # Create module structure if it doesn't exist yet
-    # Handle NVIDIA module
-    if "transformers.models.nvembed.modeling_nvembed" not in sys.modules:
-        # Create parent modules if they don't exist
-        if "transformers.models.nvembed" not in sys.modules:
-            sys.modules["transformers.models.nvembed"] = type('', (), {})
-        # Create the module we need
-        sys.modules["transformers.models.nvembed.modeling_nvembed"] = type('', (), {})
-
-    # Handle Qwen module
-    if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
-        # Create parent modules if they don't exist
-        if "transformers.models.qwen2" not in sys.modules:
-            sys.modules["transformers.models.qwen2"] = type('', (), {})
-        # Create the module we need
-        sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
-
-    # Add the class to modules
-    sys.modules["transformers.models.nvembed.modeling_nvembed"].Replicate = Replicate
-    sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
-
-from explanation_generator import ExplanationGenerator

 # Download NLTK resources
 try:
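The fallback removed in the hunk above worked by pre-registering stub entries in `sys.modules` so that remote modeling code loaded with `trust_remote_code=True` could still import a `Replicate` symbol. A minimal sketch of the same technique, using `types.ModuleType` in place of the original anonymous `type('', (), {})` stubs — the module path and class shape are copied from the deleted code, and this is an illustrative workaround, not a supported transformers API:

```python
import sys
import types

import torch

class Replicate(torch.nn.Module):
    """Stand-in for a symbol the remote modeling code expects to import."""
    def __init__(self, module, num_replicas=1):
        super().__init__()
        self.module = module
        self.num_replicas = num_replicas

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)

# Pre-register a stub module so `from ...modeling_nvembed import Replicate` resolves.
stub = types.ModuleType("transformers.models.nvembed.modeling_nvembed")
stub.Replicate = Replicate
sys.modules[stub.__name__] = stub
```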
@@ -68,49 +26,17 @@ try:
 except LookupError:
     nltk.download('punkt')

-# Initialize embedding model at startup
-EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
-
-if USE_ALT_MODELS:
-    # Use the alternative loading approach
-    global_embedding_model, global_embedding_tokenizer = load_embedding_model(EMBEDDING_MODEL_NAME)
-else:
-    # Use the original approach
-    print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
-    try:
-        # Load embedding model and tokenizer
-        global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
-        global_embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto")
-        print(f"Successfully loaded {EMBEDDING_MODEL_NAME}")
-    except Exception as e:
-        print(f"Error loading embedding model: {str(e)}")
-        global_embedding_tokenizer = None
-        global_embedding_model = None
-
 # Set page configuration
 st.set_page_config(
-    page_title="Resume Screener & Skill Extractor",
-    page_icon="📄",
     layout="wide",
     initial_sidebar_state="expanded"
 )

-# Sidebar for model selection and weights
 with st.sidebar:
-    st.title("Configuration")
-
-    # Model selection
-    embedding_model_name = st.selectbox(
-        "Embedding Model",
-        ["nvidia/NV-Embed-v2"],
-        index=0
-    )
-
-    explanation_model_name = st.selectbox(
-        "Explanation Model",
-        ["Qwen/Qwen3-14B"],
-        index=0
-    )

     # Ranking weights
     st.subheader("Ranking Weights")
120
 
121
  # Advanced options
122
  st.subheader("Advanced Options")
123
- top_k = st.number_input("Number of results to display", min_value=1, max_value=20, value=10, step=1)
124
- use_explanation = st.checkbox("Generate Explanations", value=True)
125
- use_faiss = st.checkbox("Use FAISS for fast search", value=True)
126
-
127
- # Memory optimization options
128
- st.subheader("Memory Optimization")
129
- memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
130
- clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
131
- gc_collect_interval = st.number_input(
132
- "Garbage collection interval (files)",
133
- min_value=10,
134
- max_value=1000,
135
- value=100,
136
- step=10,
137
- help="Run garbage collection after processing this many files"
138
- )
139
 
140
  st.markdown("---")
141
- st.markdown("### About")
142
- st.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.")
 
 
 
143
 
144
- # Initialize session state variables
145
- if 'resumes_uploaded' not in st.session_state:
146
- st.session_state.resumes_uploaded = False
147
- if 'job_description' not in st.session_state:
148
- st.session_state.job_description = ""
149
  if 'results' not in st.session_state:
150
  st.session_state.results = []
151
- if 'embedding_model' not in st.session_state:
152
- st.session_state.embedding_model = global_embedding_model
153
- if 'tokenizer' not in st.session_state:
154
- st.session_state.tokenizer = global_embedding_tokenizer
155
- if 'faiss_index' not in st.session_state:
156
- st.session_state.faiss_index = None
157
- if 'explanation_generator' not in st.session_state:
158
- st.session_state.explanation_generator = None
159
 
160
- class ResumeScreener:
161
- def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/Qwen3-14B"):
162
- """Initialize the ResumeScreener with the specified embedding model"""
163
- self.embedding_model_name = embedding_model_name
164
- self.explanation_model_name = explanation_model_name
165
- # Initialize with preloaded models
166
- self.model = st.session_state.embedding_model
167
- self.tokenizer = st.session_state.tokenizer
168
- self.faiss_index = None
169
- self.embedding_size = None
170
- self.explanation_generator = None
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- # Initialize explanation generator
173
- if use_explanation and st.session_state.explanation_generator is None:
174
- with st.spinner("Initializing explanation generator..."):
175
- st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
176
- self.explanation_generator = st.session_state.explanation_generator
177
- elif use_explanation:
178
- self.explanation_generator = st.session_state.explanation_generator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
- def extract_text_from_file(self, file, file_type):
181
  """Extract text from various file types"""
182
  try:
183
  if file_type == "pdf":
184
- # Use pdfplumber for better text extraction
185
- with pdfplumber.open(file) as pdf:
186
- text = ""
187
- for page in pdf.pages:
188
- text += page.extract_text() or ""
189
-
190
- # If pdfplumber fails, try PyPDF2 as fallback
191
- if not text.strip():
192
- reader = PyPDF2.PdfReader(file)
193
  text = ""
194
- for page_num in range(len(reader.pages)):
195
- page = reader.pages[page_num]
196
  text += page.extract_text() or ""
197
-
198
- return text
199
-
 
 
 
 
 
 
 
200
  elif file_type == "docx":
201
- doc = Document(file)
202
  return " ".join([paragraph.text for paragraph in doc.paragraphs])
203
 
204
  elif file_type == "txt":
205
- return file.read().decode("utf-8")
206
-
 
207
  elif file_type == "csv":
208
- csv_text = ""
209
- csv_reader = csv.reader(io.StringIO(file.read().decode("utf-8")))
210
- for row in csv_reader:
211
- csv_text += " ".join(row) + " "
212
- return csv_text
213
-
214
- else:
215
- st.error(f"Unsupported file type: {file_type}")
216
- return ""
217
-
218
  except Exception as e:
219
- st.error(f"Error extracting text from file: {str(e)}")
220
  return ""
221
 
222
  def get_embedding(self, text):
223
- """Generate text embedding for a given text"""
224
- if self.model is None:
225
- st.error("Embedding model not available. Please check your environment.")
226
- return np.zeros(768) # Default embedding size as fallback
227
 
228
  try:
229
- # For HuggingFace models
230
- inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
 
 
 
 
 
 
 
 
231
 
232
- # Move inputs to same device as model
233
- device = next(self.model.parameters()).device
234
  inputs = {k: v.to(device) for k, v in inputs.items()}
235
 
236
  with torch.no_grad():
237
- outputs = self.model(**inputs)
238
-
239
- # Handle specific case for NV-Embed-v2 which returns a nested structure
240
- if self.embedding_model_name == "nvidia/NV-Embed-v2":
241
- # Access the embedding from the NV-Embed specific output format
242
- if hasattr(outputs, "pooler_output"):
243
- embeddings = outputs.pooler_output
244
- embedding_np = embeddings.cpu().detach().numpy()
245
- if self.embedding_size is None:
246
- self.embedding_size = embedding_np.shape[1]
247
- return embedding_np[0] # Return the first embedding
248
- # Try to handle multi-level dictionary if the model changed output format
249
- elif isinstance(outputs, dict) and "embedding" in outputs:
250
- embeddings = outputs["embedding"]
251
- embedding_np = embeddings.cpu().detach().numpy()
252
- if self.embedding_size is None:
253
- self.embedding_size = embedding_np.shape[1]
254
- return embedding_np[0]
255
-
256
- # Handle different output structures
257
- if hasattr(outputs, "last_hidden_state"):
258
- # Mean pooling across token dimension
259
- embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
260
- embedding_np = embeddings.cpu().detach().numpy()
261
-
262
- # Set embedding size if not set
263
- if self.embedding_size is None:
264
- self.embedding_size = embedding_np.shape[0]
265
-
266
- return embedding_np
267
- elif isinstance(outputs, dict) and "embeddings" in outputs:
268
- # For models that return a dictionary with embeddings
269
- embeddings = outputs["embeddings"]
270
- embedding_np = embeddings.cpu().detach().numpy()
271
 
272
- # Set embedding size if not set
273
- if self.embedding_size is None:
274
- self.embedding_size = embedding_np.shape[1] # Use correct dimension
275
-
276
- return embedding_np[0] # Return the first embedding
277
- elif isinstance(outputs, torch.Tensor):
278
- # For models that return a tensor directly
279
- embedding_np = outputs.cpu().detach().numpy()
280
-
281
- # Set embedding size if not set
282
- if self.embedding_size is None:
283
- self.embedding_size = embedding_np.shape[-1]
284
-
285
- return embedding_np.squeeze()
286
  else:
287
- # If we can't determine the output structure, try to inspect it for debugging
288
- st.warning(f"Unexpected output structure from model: {type(outputs)}")
289
- if hasattr(outputs, "__dict__"):
290
- for attr_name in dir(outputs):
291
- if not attr_name.startswith('_'):
292
- attr = getattr(outputs, attr_name)
293
- if isinstance(attr, torch.Tensor):
294
- st.info(f"Found tensor attribute '{attr_name}' with shape {attr.shape}")
295
- embedding_np = attr.cpu().detach().numpy()
296
- if self.embedding_size is None:
297
- self.embedding_size = embedding_np.shape[-1]
298
- return embedding_np.squeeze()
299
 
300
- # Last resort: return zeros
301
- if self.embedding_size is None:
302
- self.embedding_size = 768 # Default size
303
- return np.zeros(self.embedding_size)
304
  except Exception as e:
305
  st.error(f"Error generating embedding: {str(e)}")
306
- if self.embedding_size is None:
307
- self.embedding_size = 768 # Default size
308
- return np.zeros(self.embedding_size)
309
-
310
- def create_faiss_index(self, embeddings):
311
- """Create a FAISS index for fast similarity search"""
312
- # Get the dimension of the embeddings
313
- dimension = embeddings[0].shape[0]
314
-
315
- # Create a FAISS index
316
- index = faiss.IndexFlatIP(dimension) # Inner product for cosine similarity with normalized vectors
317
-
318
- # Add normalized vectors to the index
319
- embeddings_normalized = np.vstack([emb / np.linalg.norm(emb) for emb in embeddings])
320
- index.add(embeddings_normalized)
321
-
322
- return index
323
-
324
- def query_faiss_index(self, index, query_embedding, k=10):
325
- """Query the FAISS index with a query embedding"""
326
- # Normalize query embedding
327
- query_embedding = query_embedding / np.linalg.norm(query_embedding)
328
-
329
- # Reshape to a row vector if needed
330
- if len(query_embedding.shape) == 1:
331
- query_embedding = query_embedding.reshape(1, -1)
332
-
333
- # Query the index
334
- scores, indices = index.search(query_embedding, k)
335
-
336
- return scores[0], indices[0] # Return the scores and indices as flat arrays
337
 
338
  def calculate_bm25_scores(self, resume_texts, job_description):
339
  """Calculate BM25 scores for keyword matching"""
340
- # Tokenize job description
341
- job_tokens = word_tokenize(job_description.lower())
342
-
343
- # Prepare corpus from resumes
344
- corpus = [word_tokenize(resume.lower()) for resume in resume_texts]
345
-
346
- # Check if corpus is empty
347
- if not corpus or len(corpus) == 0:
348
- st.error("No resume texts provided for BM25 calculation")
349
- return [0.0] * len(resume_texts)
350
-
351
- # Check for empty documents in corpus
352
- filtered_corpus = [doc for doc in corpus if len(doc) > 0]
353
- if not filtered_corpus:
354
- st.error("All resume texts are empty after tokenization")
355
- return [0.0] * len(resume_texts)
356
-
357
- # Initialize BM25
358
  try:
359
- bm25 = BM25Okapi(filtered_corpus)
360
-
361
- # Calculate scores
362
- scores = bm25.get_scores(job_tokens)
363
-
364
- # If we filtered out empty documents, we need to reconstruct the scores array
365
- if len(filtered_corpus) != len(corpus):
366
- full_scores = []
367
- filtered_idx = 0
368
- for i in range(len(corpus)):
369
- if len(corpus[i]) > 0:
370
- full_scores.append(scores[filtered_idx])
371
- filtered_idx += 1
372
- else:
373
- full_scores.append(0.0)
374
- return full_scores
375
- else:
376
- return scores
377
  except Exception as e:
378
- st.error(f"Error in BM25 calculation: {str(e)}")
379
  return [0.0] * len(resume_texts)
380
 
381
- def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding, semantic_weight=0.7, use_faiss=True):
382
- """Calculate hybrid scores combining semantic similarity and BM25"""
383
- # Calculate semantic similarity scores (cosine similarity)
384
- if use_faiss and len(resume_embeddings) > 10:
385
- # Create FAISS index if not already created
386
- if st.session_state.faiss_index is None:
387
- index = self.create_faiss_index(resume_embeddings)
388
- st.session_state.faiss_index = index
389
- else:
390
- index = st.session_state.faiss_index
391
-
392
- # Query index with job embedding
393
- faiss_scores, faiss_indices = self.query_faiss_index(index, job_embedding, k=len(resume_embeddings))
394
-
395
- # Create full semantic scores array
396
- semantic_scores = np.zeros(len(resume_embeddings))
397
- for i, idx in enumerate(faiss_indices):
398
- if idx < len(resume_embeddings):
399
- semantic_scores[idx] = faiss_scores[i]
400
- else:
401
- # Direct cosine similarity calculation for smaller datasets
402
- semantic_scores = []
403
- for emb in resume_embeddings:
404
- # Normalize the embeddings for cosine similarity
405
- emb_norm = emb / np.linalg.norm(emb)
406
- job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
407
-
408
- # Calculate cosine similarity
409
- similarity = np.dot(emb_norm, job_emb_norm)
410
- semantic_scores.append(similarity)
411
 
412
  # Calculate BM25 scores
413
  bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
414
 
415
  # Normalize BM25 scores
416
- if max(bm25_scores) > 0:
417
- bm25_scores = [score / max(bm25_scores) for score in bm25_scores]
 
418
 
419
  # Calculate hybrid scores
420
- keyword_weight = 1.0 - semantic_weight
421
  hybrid_scores = [
422
  (semantic_weight * sem_score) + (keyword_weight * bm25_score)
423
  for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
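The `create_faiss_index`/`query_faiss_index` helpers deleted in the hunk above rely on one identity: the inner product of L2-normalized vectors is their cosine similarity, which is why `faiss.IndexFlatIP` is built over normalized embeddings. A small self-contained sketch of that equivalence with toy dimensions (all names and sizes here are illustrative):

```python
import faiss
import numpy as np

rng = np.random.default_rng(0)
docs = rng.normal(size=(100, 64)).astype("float32")   # toy resume embeddings
query = rng.normal(size=(1, 64)).astype("float32")    # toy job embedding

# L2-normalize in place so inner product == cosine similarity
faiss.normalize_L2(docs)
faiss.normalize_L2(query)

index = faiss.IndexFlatIP(64)            # exact inner-product search
index.add(docs)
scores, ids = index.search(query, 10)    # top-10 cosine scores and row ids
print(ids[0], scores[0])
```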
@@ -426,682 +250,423 @@ class ResumeScreener:
         return hybrid_scores, semantic_scores, bm25_scores

     def extract_skills(self, text, job_description):
-        """Extract skills from text based on job description"""
-        # Simple skill extraction using regex and job description keywords
-        # In a real implementation, this could be enhanced with ML-based skill extraction
-
-        # Extract potential skills from job description (words 3 letters or longer)
-        potential_skills = set()
-
-        # Common skill-related phrases that might appear in job descriptions
-        skill_indicators = ["experience with", "knowledge of", "familiar with", "proficient in",
-                            "skills in", "expertise in", "background in", "capabilities in",
-                            "years of experience in", "understanding of", "trained in"]
-
-        # Extract skills from sentences containing skill indicators
-        sentences = sent_tokenize(job_description)
-        for sentence in sentences:
-            sentence_lower = sentence.lower()
-            for indicator in skill_indicators:
-                if indicator in sentence_lower:
-                    # Extract words after the indicator, possibly until end of sentence or punctuation
-                    skills_part = sentence_lower.split(indicator, 1)[1]
-
-                    # Extract words, cleaning up symbols
-                    words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skills_part)
-                    for word in words:
-                        if len(word) >= 3:  # Only consider words 3 letters or longer
-                            potential_skills.add(word.lower())

-        # Add explicit skills - look for comma-separated lists or bullet points
-        skill_lists = re.findall(r'(?:skills|requirements|qualifications)[^\n.]*?:(.+?)(?:\n|$)', job_description.lower())
-        for skill_list in skill_lists:
-            words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skill_list)
-            for word in words:
-                if len(word) >= 3:
-                    potential_skills.add(word.lower())

-        # Add common tech skills if they appear in the job description
-        common_tech_skills = ["python", "java", "c++", "javascript", "sql", "react", "node.js", "typescript",
-                              "html", "css", "aws", "azure", "gcp", "docker", "kubernetes", "terraform",
-                              "git", "ci/cd", "agile", "scrum", "rest", "graphql", "ml", "ai", "data science"]

-        for skill in common_tech_skills:
-            if skill in job_description.lower():
-                potential_skills.add(skill)

-        # Find skills in the resume
-        matched_skills = []
-        for skill in potential_skills:
-            # Make it a word boundary search with regex
-            pattern = r'\b' + re.escape(skill) + r'\b'
-            matches = re.findall(pattern, text.lower())
-            if matches:
-                matched_skills.append(skill)

-        return list(set(matched_skills))
-
-    def extract_key_phrases(self, text, job_description):
-        """Extract key phrases from text that match job description keywords"""
-        # Identify job skills first
-        skills = self.extract_skills(job_description, job_description)
-
-        # Extract sentences that contain skills
-        sentences = sent_tokenize(text)
-        skill_sentences = []
-
-        for sentence in sentences:
-            sentence_lower = sentence.lower()
-            for skill in skills:
-                if skill in sentence_lower:
-                    # Append the sentence with the skill highlighted
-                    highlighted = sentence.replace(skill, f"**{skill}**")
-                    skill_sentences.append(highlighted)
-                    break
-
-        # Get additional generic matches if we don't have enough skill sentences
-        if len(skill_sentences) < 5:
-            # Simple extraction based on job description keywords
-            job_tokens = set(word.lower() for word in word_tokenize(job_description) if len(word) > 3)
-            text_tokens = word_tokenize(text)
-
-            matches = []
-            for i, token in enumerate(text_tokens):
-                if token.lower() in job_tokens:
-                    # Get a phrase context (5 words before and after)
-                    start = max(0, i - 5)
-                    end = min(len(text_tokens), i + 6)
-                    phrase = " ".join(text_tokens[start:end])
-                    matches.append(phrase)
-
-            # Add unique phrases to complement skill sentences
-            unique_matches = list(set(matches))
-            skill_sentences.extend(unique_matches[:5 - len(skill_sentences)])
-
-        # Return unique phrases, up to 5
-        return skill_sentences[:5]

     def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
-        """Generate explanation for why a resume was ranked highly using QwQ-32B model"""
-        # Use the explanation generator if available
-        if use_explanation and self.explanation_generator:
-            return self.explanation_generator.generate_explanation(
-                resume_text,
-                job_description,
-                score,
-                semantic_score,
-                bm25_score,
-                skills
             )
-        else:
-            # Fallback to simple explanation
-            matching_phrases = self.extract_key_phrases(resume_text, job_description)

-            explanation = f"This resume received a score of {score:.2f}, with semantic relevance of {semantic_score:.2f} and keyword match of {bm25_score:.2f}. "

-            if skills:
-                explanation += f"The resume shows experience with key skills: {', '.join(skills[:5])}. "

-            if matching_phrases:
-                explanation += f"Key matching elements include: {matching_phrases[0]}"

-            return explanation

-# Function to create a download link for dataframe as CSV
-def get_csv_download_link(df, filename="results.csv"):
     csv = df.to_csv(index=False)
     b64 = base64.b64encode(csv.encode()).decode()
-    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
-    return href

-# Add this new function after the get_csv_download_link function
-def get_huggingface_spaces_datasets():
-    """Check for datasets in Hugging Face Spaces environment"""
-    datasets = []
-
-    # Common dataset paths in Hugging Face Spaces
-    potential_paths = [
-        "/data",  # Common mount point
-        "data",  # Relative path
-        os.path.expanduser("~/data"),  # Home directory
-    ]
-
-    for path in potential_paths:
-        if os.path.exists(path) and os.path.isdir(path):
-            # Look for CSV files
-            csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
-            for csv_file in csv_files:
-                datasets.append(os.path.join(path, csv_file))
-
-            # Look for directories that might contain PDFs
-            for subdir in os.listdir(path):
-                subdir_path = os.path.join(path, subdir)
-                if os.path.isdir(subdir_path):
-                    pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')])
-                    if pdf_count > 0:
-                        datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))
-
-    return datasets
-
-# Main app UI
-st.title("Resume Screener & Skill Extractor")
 st.markdown("---")

-# Initialize the resume screener
-screener = ResumeScreener(embedding_model_name, explanation_model_name)

-# Job description input
-st.header("1. Enter Job Description")
 job_description = st.text_area(
-    "Paste the job description or requirements here:",
-    height=200,
-    help="Enter the complete job description or a list of required skills and qualifications."
 )

-# Resume upload
-st.header("2. Upload Resumes")
-upload_option = st.radio(
-    "Choose upload method:",
-    ["Upload Files", "Upload from Dataset", "Process Directory"]
 )

-uploaded_files = []
 resume_texts = []
 file_names = []

-if upload_option == "Upload Files":
     uploaded_files = st.file_uploader(
         "Upload resume files",
-        type=["pdf", "docx", "txt", "csv"],
         accept_multiple_files=True,
-        help="Upload multiple resume files in PDF, DOCX, TXT, or CSV format."
     )

     if uploaded_files:
-        with st.spinner("Processing resumes..."):
             for file in uploaded_files:
                 file_type = file.name.split('.')[-1].lower()

                 with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
                     tmp_file.write(file.getvalue())
                     tmp_path = tmp_file.name

                 text = screener.extract_text_from_file(tmp_path, file_type)
-                if text:
                     resume_texts.append(text)
                     file_names.append(file.name)

-                # Clean up temp file
                 os.unlink(tmp_path)
-
-            st.session_state.resumes_uploaded = True
-            st.success(f"Successfully processed {len(resume_texts)} resumes.")
-elif upload_option == "Process Directory":
-    st.write("Process resume files from a directory on the server.")
-
-    # Input for directory path
-    resume_dir = st.text_input(
-        "Enter the path to the directory containing resume files:",
-        help="For Hugging Face Spaces, this could be a mounted directory or dataset."
-    )
-
-    # Limit batch size
-    batch_size = st.number_input(
-        "Number of files to process per batch (lower for less memory usage):",
-        min_value=10,
-        max_value=1000,
-        value=100,
-        step=10
-    )
-
-    # File types to process
-    file_types = st.multiselect(
-        "Select file types to process:",
-        ["pdf", "docx", "txt", "csv"],
-        default=["pdf"]
-    )

-    if resume_dir and st.button("Process Directory"):
-        if os.path.isdir(resume_dir):
-            # Get all files matching the selected types
-            all_files = []
-            for file_type in file_types:
-                all_files.extend([
-                    os.path.join(resume_dir, f)
-                    for f in os.listdir(resume_dir)
-                    if f.lower().endswith(f'.{file_type}')
-                ])

-            if all_files:
-                total_files = len(all_files)
-                st.write(f"Found {total_files} files. Processing in batches of {batch_size}...")
-
-                # Process in batches
-                processed_count = 0
-                progress_bar = st.progress(0)
-                status_text = st.empty()
-
-                for i in range(0, total_files, batch_size):
-                    batch_files = all_files[i:i+batch_size]
-
-                    for j, file_path in enumerate(batch_files):
-                        try:
-                            file_type = file_path.split('.')[-1].lower()
-                            text = screener.extract_text_from_file(file_path, file_type)
-                            if text:
-                                resume_texts.append(text)
-                                file_names.append(os.path.basename(file_path))
-                                processed_count += 1
-
-                            # Apply memory optimization if enabled
-                            if memory_optimization and j % gc_collect_interval == 0 and j > 0:
-                                import gc
-                                gc.collect()
-                                status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
-                        except Exception as e:
-                            st.warning(f"Error processing {file_path}: {str(e)}")
-
-                    # Update progress
-                    progress = min(1.0, (i + len(batch_files)) / total_files)
-                    progress_bar.progress(progress)
-                    status_text.text(f"Processed {processed_count}/{total_files} files...")
-
-                    # Run garbage collection between batches if memory optimization is enabled
-                    if memory_optimization:
-                        import gc
-                        gc.collect()
-
-                # Final garbage collection if memory optimization is enabled
-                if memory_optimization:
-                    import gc
-                    gc.collect()
-
-                st.session_state.resumes_uploaded = True
-                st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
-            else:
-                st.error(f"No matching files found in {resume_dir}")
-        else:
-            st.error(f"Directory {resume_dir} does not exist or is not accessible.")
-elif upload_option == "Upload from Dataset":
-    # Upload from Dataset implementation
-    st.write("Upload a CSV file containing resume data or load from available datasets.")
-
-    # Check for available datasets in Hugging Face Spaces
-    hf_datasets = get_huggingface_spaces_datasets()
-
-    if hf_datasets:
-        st.subheader("Available Datasets in Hugging Face Spaces")
-        dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
-        selected_dataset = st.selectbox("Select a dataset:", dataset_options)
-
-        if selected_dataset != "None":
-            selected_index = dataset_options.index(selected_dataset) - 1  # Adjust for "None"
-            dataset_path = hf_datasets[selected_index]

-            if isinstance(dataset_path, tuple):
-                # It's a PDF directory
-                pdf_dir = dataset_path[0]
-                st.write(f"Selected PDF directory: {pdf_dir}")
-
-                batch_size = st.number_input(
-                    "Number of files to process per batch:",
-                    min_value=10,
-                    max_value=1000,
-                    value=100,
-                    step=10
-                )
-
-                if st.button("Process PDF Directory"):
-                    # Use the same processing logic as in the "Process Directory" option
-                    if os.path.isdir(pdf_dir):
-                        all_files = [
-                            os.path.join(pdf_dir, f)
-                            for f in os.listdir(pdf_dir)
-                            if f.lower().endswith('.pdf')
-                        ]
-
-                        if all_files:
-                            total_files = len(all_files)
-                            st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")
-
-                            # Process in batches
-                            processed_count = 0
-                            progress_bar = st.progress(0)
-                            status_text = st.empty()
-
-                            for i in range(0, total_files, batch_size):
-                                batch_files = all_files[i:i+batch_size]
-
-                                for j, file_path in enumerate(batch_files):
-                                    try:
-                                        text = screener.extract_text_from_file(file_path, "pdf")
-                                        if text:
-                                            resume_texts.append(text)
-                                            file_names.append(os.path.basename(file_path))
-                                            processed_count += 1
-
-                                        # Apply memory optimization if enabled
-                                        if memory_optimization and j % gc_collect_interval == 0 and j > 0:
-                                            import gc
-                                            gc.collect()
-                                    except Exception as e:
-                                        st.warning(f"Error processing {file_path}: {str(e)}")
-
-                                # Update progress
-                                progress = min(1.0, (i + len(batch_files)) / total_files)
-                                progress_bar.progress(progress)
-                                status_text.text(f"Processed {processed_count}/{total_files} files...")
-
-                                # Memory optimization
-                                if memory_optimization:
-                                    import gc
-                                    gc.collect()

-                            st.session_state.resumes_uploaded = True
-                            st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.")
-            else:
-                # It's a CSV file
-                st.write(f"Selected CSV dataset: {dataset_path}")

-                try:
-                    # Read the CSV file
-                    df = pd.read_csv(dataset_path)

-                    # Let user select which column contains the resume text
-                    text_column = st.selectbox(
-                        "Select column containing resume text:",
-                        df.columns.tolist()
-                    )
-
-                    if st.button("Process Selected CSV"):
-                        # Extract text from the selected column
-                        for i, row in df.iterrows():
-                            text = str(row[text_column])
-                            if text and not pd.isna(text):
-                                resume_texts.append(text)
-                                # Use index as filename if no filename column
-                                file_name = f"resume_{i}.txt"
-                                if 'filename' in df.columns:
-                                    file_name = row['filename']
-                                file_names.append(file_name)
-
-                        st.session_state.resumes_uploaded = True
-                        st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
-                except Exception as e:
-                    st.error(f"Error processing CSV: {str(e)}")
-
-    # Rest of the existing Upload from Dataset code
-    dataset_option = st.radio(
-        "Dataset source:",
-        ["Upload CSV", "Use Hugging Face Dataset"]
-    )
-
-    if dataset_option == "Upload CSV":
-        csv_file = st.file_uploader(
-            "Upload CSV file containing resume data",
-            type=["csv"],
-            help="CSV should contain at least a column with resume text."
         )
-
-        if csv_file:
-            with st.spinner("Processing CSV data..."):
-                # Read the CSV file
-                df = pd.read_csv(csv_file)

-                # Let user select which column contains the resume text
-                text_column = st.selectbox(
-                    "Select column containing resume text:",
-                    df.columns.tolist()
                 )
-
-            if st.button("Process Dataset"):
-                # Extract text from the selected column
-                for i, row in df.iterrows():
-                    text = str(row[text_column])
-                    if text and not pd.isna(text):
-                        resume_texts.append(text)
-                        # Use index as filename if no filename column
-                        file_name = f"resume_{i}.txt"
-                        if 'filename' in df.columns:
-                            file_name = row['filename']
-                        file_names.append(file_name)
-
-                st.session_state.resumes_uploaded = True
-                st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
-    else:
-        # Hugging Face Dataset option
-        dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
-        split = st.text_input("Enter dataset split (e.g., 'train'):", "train")
-
-        if dataset_name and st.button("Load Dataset"):
-            with st.spinner(f"Loading dataset {dataset_name}..."):
-                try:
-                    from datasets import load_dataset

-                    # Load the dataset
-                    dataset = load_dataset(dataset_name, split=split)

-                    # Display dataset info
-                    st.write(f"Dataset loaded with {len(dataset)} entries.")

-                    # Let user select which column contains the resume text
-                    if len(dataset.column_names) > 0:
-                        text_column = st.selectbox(
-                            "Select column containing resume text:",
-                            dataset.column_names
-                        )

-                        if st.button("Process Hugging Face Dataset"):
-                            # Extract text from the selected column
-                            for i, item in enumerate(dataset):
-                                if text_column in item:
-                                    text = str(item[text_column])
-                                    if text:
-                                        resume_texts.append(text)
-                                        # Use index or id field as filename
-                                        file_name = f"resume_{i}.txt"
-                                        if 'id' in item:
-                                            file_name = f"resume_{item['id']}.txt"
-                                        file_names.append(file_name)

-                            st.session_state.resumes_uploaded = True
-                            st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.")
-                except Exception as e:
-                    st.error(f"Error loading dataset: {str(e)}")
-                    st.info("Make sure you have the 'datasets' library installed: pip install datasets")

-# Process button
-if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
-    with st.spinner("Processing job description and resumes..."):
-        # Check if we have too many resumes to process at once
-        large_dataset = len(resume_texts) > 1000
-
-        # Get job description embedding
-        job_embedding = screener.get_embedding(job_description)
-
-        # For large datasets, we need to process in batches
-        if large_dataset:
-            st.info(f"Processing {len(resume_texts)} resumes in batches to manage memory usage")
-
-            # Process in batches of 500 resumes
-            batch_size = 500
-            all_hybrid_scores = []
-            all_semantic_scores = []
-            all_bm25_scores = []
-
-            # Calculate BM25 scores first (doesn't require GPU)
-            bm25_scores = screener.calculate_bm25_scores(resume_texts, job_description)

-            # Process embeddings in batches
-            progress_bar = st.progress(0)
-            for i in range(0, len(resume_texts), batch_size):
-                batch_end = min(i + batch_size, len(resume_texts))
-                batch_texts = resume_texts[i:batch_end]
-
-                st.info(f"Processing batch {i//batch_size + 1}/{(len(resume_texts) + batch_size - 1)//batch_size} " +
-                        f"(resumes {i+1}-{batch_end})")
-
-                # Get resume embeddings for this batch
-                batch_embeddings = []
-                for j, text in enumerate(batch_texts):
-                    embedding = screener.get_embedding(text)
-                    batch_embeddings.append(embedding)
-                    progress = (i + j + 1) / len(resume_texts)
-                    progress_bar.progress(progress)
-
-                # Calculate semantic scores for this batch
-                batch_semantic_scores = []
-                for emb in batch_embeddings:
-                    # Normalize the embeddings for cosine similarity
-                    emb_norm = emb / np.linalg.norm(emb)
-                    job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
-
-                    # Calculate cosine similarity
-                    similarity = np.dot(emb_norm, job_emb_norm)
-                    batch_semantic_scores.append(similarity)
-
-                # Store scores for this batch
-                all_semantic_scores.extend(batch_semantic_scores)

-                # Explicitly clear GPU memory after processing each batch
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-
-            # Calculate hybrid scores
-            semantic_scores = all_semantic_scores
-            keyword_weight = 1.0 - semantic_weight

-            # Normalize BM25 scores if they're not all zeros
-            if bm25_scores and max(bm25_scores) > 0:
-                bm25_scores = [score / max(bm25_scores) for score in bm25_scores]

-            # Calculate final hybrid scores
-            hybrid_scores = [
-                (semantic_weight * sem_score) + (keyword_weight * bm25_score)
-                for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
-            ]
-        else:
-            # Regular processing for smaller datasets
-            # Get resume embeddings
-            resume_embeddings = []
-            progress_bar = st.progress(0)
-            for i, text in enumerate(resume_texts):
-                embedding = screener.get_embedding(text)
-                resume_embeddings.append(embedding)
-                progress_bar.progress((i + 1) / len(resume_texts))
-
-            # Calculate hybrid scores
-            hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
-                resume_texts,
-                resume_embeddings,
-                job_embedding,
-                semantic_weight,
-                use_faiss
-            )
-
-        # Get top candidates
-        combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
-        sorted_data = sorted(combined_data, key=lambda x: x[2], reverse=True)
-        top_candidates = sorted_data[:int(top_k)]
-
-        # Create results with explanations if enabled
-        results = []
-        for name, text, score, semantic_score, bm25_score in top_candidates:
-            # Extract skills for this resume
-            skills = screener.extract_skills(text, job_description)

-            result = {
-                "filename": name,
-                "score": score,
-                "semantic_score": semantic_score,
-                "keyword_score": bm25_score,
-                "text_preview": text[:500] + "...",
-                "matched_phrases": screener.extract_key_phrases(text, job_description),
-                "skills": skills
-            }

-            if use_explanation:
-                explanation = screener.generate_explanation(
-                    text,
-                    job_description,
-                    score,
-                    semantic_score,
-                    bm25_score,
-                    skills
-                )
-                result["explanation"] = explanation
-            else:
-                result["explanation"] = ""
-
-            results.append(result)
-
-        st.session_state.results = results
-        st.success(f"Found top {len(results)} candidates!")

-# Display results
 if st.session_state.results:
-    st.header("3. Results")

-    # Create a DataFrame for download
-    df_data = []
     for result in st.session_state.results:
-        df_data.append({
-            "Filename": result["filename"],
-            "Score": result["score"],
-            "Semantic Score": result["semantic_score"],
-            "Keyword Score": result["keyword_score"],
-            "Skills": ", ".join(result["skills"]),
-            "Explanation": result["explanation"]
         })

-    results_df = pd.DataFrame(df_data)

-    # Display download link
-    st.markdown(get_csv_download_link(results_df), unsafe_allow_html=True)

-    # Display individual results
-    for i, result in enumerate(st.session_state.results):
-        with st.expander(f"#{i+1}: {result['filename']} (Score: {result['score']:.4f})"):
-            col1, col2 = st.columns([1, 1])

             with col1:
-                st.subheader("Scores")
-                st.write(f"Total Score: {result['score']:.4f}")
-                st.write(f"Semantic Score: {result['semantic_score']:.4f}")
-                st.write(f"Keyword Score: {result['keyword_score']:.4f}")

-                st.subheader("Matched Skills")
-                if result["skills"]:
-                    for skill in result["skills"]:
-                        st.write(f"• {skill}")
-                else:
-                    st.write("No specific skills matched.")

             with col2:
-                st.subheader("Explanation")
-                st.write(result["explanation"])
-
-                st.subheader("Key Matches")
-                for phrase in result["matched_phrases"]:
-                    st.markdown(f"{phrase}")
-
-                st.subheader("Resume Preview")
-                st.text_area("", result["text_preview"], height=150, disabled=True)
-
-    # Visualization of scores
-    st.subheader("Score Comparison")
-
-    # Prepare data for visualization
-    chart_data = pd.DataFrame({
-        "Resume": [result["filename"] for result in st.session_state.results],
-        "Semantic Score": [result["semantic_score"] for result in st.session_state.results],
-        "Keyword Score": [result["keyword_score"] for result in st.session_state.results],
-        "Total Score": [result["score"] for result in st.session_state.results]
-    })
-
-    # Display as a bar chart
-    st.bar_chart(chart_data.set_index("Resume")[["Total Score", "Semantic Score", "Keyword Score"]])

 # Footer
 st.markdown("---")
-st.markdown("Built with Streamlit and Hugging Face models (NV-Embed-v2 and Qwen3-14B)")
 import streamlit as st
 import pandas as pd
 import numpy as np
 import torch
 import os
 import tempfile
 import base64
+import re
+import io
 from rank_bm25 import BM25Okapi
+from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from nltk.tokenize import word_tokenize, sent_tokenize
 from tqdm import tqdm
+import pdfplumber
 import PyPDF2
 from docx import Document
 import csv
+from datasets import load_dataset
+import gc

 # Download NLTK resources
 try:
 except LookupError:
     nltk.download('punkt')

 # Set page configuration
 st.set_page_config(
+    page_title="AI Resume Screener",
+    page_icon="🎯",
     layout="wide",
     initial_sidebar_state="expanded"
 )

+# Sidebar configuration
 with st.sidebar:
+    st.title("⚙️ Configuration")

     # Ranking weights
     st.subheader("Ranking Weights")

     # Advanced options
     st.subheader("Advanced Options")
+    top_k = st.number_input("Number of results to display", min_value=1, max_value=50, value=10, step=1)
+    use_explanation = st.checkbox("Generate AI Explanations", value=True)

     st.markdown("---")
+    st.markdown("### 🤖 Models Used")
+    st.markdown("- **Embedding**: NVIDIA NV-Embed-v2")
+    st.markdown("- **Explanation**: Qwen3-14B (4-bit)")
+    st.markdown("### 📊 About")
+    st.markdown("This app uses hybrid ranking combining semantic similarity with keyword matching to find the best candidates for job positions.")

+# Initialize session state
+if 'embedding_model' not in st.session_state:
+    st.session_state.embedding_model = None
+if 'explanation_model' not in st.session_state:
+    st.session_state.explanation_model = None
 if 'results' not in st.session_state:
     st.session_state.results = []
+@st.cache_resource
+def load_embedding_model():
+    """Load and cache the embedding model"""
+    try:
+        with st.spinner("🔄 Loading NVIDIA NV-Embed-v2 model..."):
+            tokenizer = AutoTokenizer.from_pretrained("nvidia/NV-Embed-v2", trust_remote_code=True)
+            model = AutoModel.from_pretrained(
+                "nvidia/NV-Embed-v2",
+                trust_remote_code=True,
+                device_map="auto",
+                torch_dtype=torch.float16
+            )
+        st.success("✅ Embedding model loaded successfully!")
+        return model, tokenizer
+    except Exception as e:
+        st.error(f"❌ Error loading embedding model: {str(e)}")
+        return None, None
+
+@st.cache_resource
+def load_explanation_model():
+    """Load and cache the explanation model with quantization"""
+    if not use_explanation:
+        return None, None

+    try:
+        with st.spinner("🔄 Loading Qwen3-14B model with 4-bit quantization..."):
+            # Configure 4-bit quantization
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True
+            )
+
+            tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct", trust_remote_code=True)
+            model = AutoModelForCausalLM.from_pretrained(
+                "Qwen/Qwen2.5-14B-Instruct",
+                quantization_config=quantization_config,
+                device_map="auto",
+                trust_remote_code=True,
+                torch_dtype=torch.float16
+            )
+        st.success("✅ Explanation model loaded successfully!")
+        return model, tokenizer
+    except Exception as e:
+        st.error(f"❌ Error loading explanation model: {str(e)}")
+        return None, None
+
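As a rough sanity check on why the NF4 config above makes a 14B-parameter model fit on a single GPU: 4-bit weights take about half a byte per parameter, versus two bytes for float16. Back-of-the-envelope only — this ignores activations, the KV cache, and quantization metadata:

```python
params = 14e9               # ~14B parameters
bytes_fp16 = params * 2     # ≈ 26 GiB of weights as float16
bytes_nf4 = params * 0.5    # ≈ 6.5 GiB of weights at 4 bits each
print(f"fp16 ≈ {bytes_fp16 / 1024**3:.1f} GiB, nf4 ≈ {bytes_nf4 / 1024**3:.1f} GiB")
```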
115
+ class ResumeScreener:
116
+ def __init__(self):
117
+ # Load models
118
+ self.embedding_model, self.embedding_tokenizer = load_embedding_model()
119
+ if use_explanation:
120
+ self.explanation_model, self.explanation_tokenizer = load_explanation_model()
121
+ else:
122
+ self.explanation_model, self.explanation_tokenizer = None, None
123
 
124
+ def extract_text_from_file(self, file_path, file_type):
125
  """Extract text from various file types"""
126
  try:
127
  if file_type == "pdf":
128
+ with open(file_path, 'rb') as file:
129
+ with pdfplumber.open(file) as pdf:
 
 
 
 
 
 
 
130
  text = ""
131
+ for page in pdf.pages:
 
132
  text += page.extract_text() or ""
133
+
134
+ if not text.strip():
135
+ # Fallback to PyPDF2
136
+ file.seek(0)
137
+ reader = PyPDF2.PdfReader(file)
138
+ text = ""
139
+ for page in reader.pages:
140
+ text += page.extract_text() or ""
141
+ return text
142
+
143
  elif file_type == "docx":
144
+ doc = Document(file_path)
145
  return " ".join([paragraph.text for paragraph in doc.paragraphs])
146
 
147
  elif file_type == "txt":
148
+ with open(file_path, 'r', encoding='utf-8') as file:
149
+ return file.read()
150
+
151
  elif file_type == "csv":
152
+ with open(file_path, 'r', encoding='utf-8') as file:
153
+ csv_reader = csv.reader(file)
154
+ return " ".join([" ".join(row) for row in csv_reader])
155
+
 
 
 
 
 
 
156
  except Exception as e:
157
+ st.error(f"Error extracting text from {file_path}: {str(e)}")
158
  return ""
159
 
160
  def get_embedding(self, text):
161
+ """Generate embedding for text"""
162
+ if self.embedding_model is None:
163
+ return np.zeros(4096) # NV-Embed-v2 dimension
 
164
 
165
  try:
166
+ # Truncate text to avoid memory issues
167
+ text = text[:8192] # Reasonable limit for NV-Embed-v2
168
+
169
+ inputs = self.embedding_tokenizer(
170
+ text,
171
+ return_tensors="pt",
172
+ truncation=True,
173
+ max_length=512,
174
+ padding=True
175
+ )
176
 
177
+ # Move to same device as model
178
+ device = next(self.embedding_model.parameters()).device
179
  inputs = {k: v.to(device) for k, v in inputs.items()}
180
 
181
  with torch.no_grad():
182
+ outputs = self.embedding_model(**inputs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
+ # Extract embeddings - NV-Embed-v2 specific
185
+ if hasattr(outputs, 'pooler_output'):
186
+ embeddings = outputs.pooler_output
187
+ elif hasattr(outputs, 'last_hidden_state'):
188
+ embeddings = outputs.last_hidden_state.mean(dim=1)
 
 
 
 
 
 
 
 
 
189
  else:
190
+ embeddings = outputs[0].mean(dim=1)
 
 
 
 
 
 
 
 
 
 
 
191
 
192
+ return embeddings.cpu().numpy().squeeze()
193
+
 
 
194
  except Exception as e:
195
  st.error(f"Error generating embedding: {str(e)}")
196
+ return np.zeros(4096)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  def calculate_bm25_scores(self, resume_texts, job_description):
199
  """Calculate BM25 scores for keyword matching"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  try:
201
+ job_tokens = word_tokenize(job_description.lower())
202
+ corpus = [word_tokenize(text.lower()) for text in resume_texts if text.strip()]
203
+
204
+ if not corpus:
205
+ return [0.0] * len(resume_texts)
206
+
207
+ bm25 = BM25Okapi(corpus)
208
+ scores = bm25.get_scores(job_tokens)
209
+ return scores.tolist()
210
+
 
 
 
 
 
 
 
 
211
  except Exception as e:
212
+ st.error(f"Error calculating BM25 scores: {str(e)}")
213
  return [0.0] * len(resume_texts)
214
 
215
+ def calculate_hybrid_scores(self, resume_texts, job_description):
216
+ """Calculate hybrid scores combining semantic and keyword matching"""
217
+ # Get job embedding
218
+ job_embedding = self.get_embedding(job_description)
219
+
220
+ # Get resume embeddings
221
+ resume_embeddings = []
222
+ progress_bar = st.progress(0)
223
+ for i, text in enumerate(resume_texts):
224
+ embedding = self.get_embedding(text)
225
+ resume_embeddings.append(embedding)
226
+ progress_bar.progress((i + 1) / len(resume_texts))
227
+
228
+ # Calculate semantic scores (cosine similarity)
229
+ semantic_scores = []
230
+ for resume_emb in resume_embeddings:
231
+ job_norm = job_embedding / (np.linalg.norm(job_embedding) + 1e-8)
232
+ resume_norm = resume_emb / (np.linalg.norm(resume_emb) + 1e-8)
233
+ similarity = np.dot(job_norm, resume_norm)
234
+ semantic_scores.append(float(similarity))
 
 
 
 
 
 
 
 
 
 
235
 
236
  # Calculate BM25 scores
237
  bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
238
 
239
  # Normalize BM25 scores
240
+ if bm25_scores and max(bm25_scores) > 0:
241
+ max_bm25 = max(bm25_scores)
242
+ bm25_scores = [score / max_bm25 for score in bm25_scores]
243
 
244
  # Calculate hybrid scores
 
245
  hybrid_scores = [
246
  (semantic_weight * sem_score) + (keyword_weight * bm25_score)
247
  for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
 
250
  return hybrid_scores, semantic_scores, bm25_scores
251
 
252
  def extract_skills(self, text, job_description):
253
+ """Extract skills from resume based on job description"""
254
+ # Common tech skills and job-related terms
255
+ common_skills = [
256
+ "python", "java", "javascript", "react", "node.js", "sql", "html", "css",
257
+ "aws", "azure", "docker", "kubernetes", "git", "agile", "scrum", "ci/cd",
258
+ "machine learning", "data science", "artificial intelligence", "tensorflow",
259
+ "pytorch", "pandas", "numpy", "scikit-learn", "mysql", "postgresql",
260
+ "mongodb", "redis", "elasticsearch", "spark", "hadoop", "tableau", "powerbi"
261
+ ]
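+ # This list is only a heuristic seed; matching below is case-insensitive
+ # and limited to exact substring hits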
 
 
262
 
263
+ # Extract skills from job description
264
+ job_words = set(word.lower() for word in word_tokenize(job_description) if len(word) > 2)
 
 
265
 
266
+ # Find matching skills
267
+ found_skills = []
268
+ text_lower = text.lower()
 
269
 
270
+ # Check common skills
271
+ for skill in common_skills:
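+ # match against the raw job description text, since joining a set of
+ # tokens has arbitrary order and would miss multi-word skills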
272
+ if skill in text_lower and skill in job_description.lower():
273
+ found_skills.append(skill)
274
 
275
+ # Check job-specific terms
276
+ for word in job_words:
277
+ if len(word) > 3 and word in text_lower:
278
+ found_skills.append(word)
 
 
279
 
280
+ return list(set(found_skills))[:10] # up to 10 unique skills (set order is arbitrary)
 
 
281
 
282
  def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
283
+ """Generate explanation using Qwen model"""
284
+ if self.explanation_model is None or self.explanation_tokenizer is None:
285
+ return self._generate_simple_explanation(score, semantic_score, bm25_score, skills)
286
+
287
+ try:
288
+ # Create prompt
289
+ prompt = f"""As a recruitment AI assistant, explain why this resume scored {score:.2f} for the given job position.
290
+
291
+ Job Requirements:
292
+ {job_description[:500]}...
293
+
294
+ Resume Summary:
295
+ {resume_text[:800]}...
296
+
297
+ Scores:
298
+ - Overall: {score:.2f}/1.0
299
+ - Semantic Match: {semantic_score:.2f}/1.0
300
+ - Keyword Match: {bm25_score:.2f}/1.0
301
+ - Key Skills: {', '.join(skills[:5])}
302
+
303
+ Provide a concise 2-3 sentence explanation of the match quality and key strengths."""
304
+
305
+ # Generate response
306
+ messages = [{"role": "user", "content": prompt}]
307
+ text = self.explanation_tokenizer.apply_chat_template(
308
+ messages, tokenize=False, add_generation_prompt=True
309
  )
 
 
 
310
 
311
+ inputs = self.explanation_tokenizer(text, return_tensors="pt").to(self.explanation_model.device)
312
 
313
+ with torch.no_grad():
314
+ outputs = self.explanation_model.generate(
315
+ **inputs,
316
+ max_new_tokens=150,
317
+ temperature=0.7,
318
+ do_sample=True,
319
+ pad_token_id=self.explanation_tokenizer.eos_token_id
320
+ )
321
+
322
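+ # Slice off the prompt tokens so only the newly generated text is decoded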
+ response = self.explanation_tokenizer.decode(
323
+ outputs[0][inputs.input_ids.shape[1]:],
324
+ skip_special_tokens=True
325
+ )
326
 
327
+ return response.strip()[:400] # Limit length
 
328
 
329
+ except Exception as e:
330
+ st.warning(f"AI explanation failed: {str(e)}")
331
+ return self._generate_simple_explanation(score, semantic_score, bm25_score, skills)
332
+
333
+ def _generate_simple_explanation(self, score, semantic_score, bm25_score, skills):
334
+ """Fallback explanation generation"""
335
+ if score > 0.8:
336
+ quality = "excellent"
337
+ elif score > 0.6:
338
+ quality = "good"
339
+ elif score > 0.4:
340
+ quality = "moderate"
341
+ else:
342
+ quality = "limited"
343
+
344
+ explanation = f"This resume shows {quality} alignment with the job requirements (score: {score:.2f}). "
345
+
346
+ if semantic_score > bm25_score:
347
+ explanation += f"Strong conceptual match ({semantic_score:.2f}) with relevant experience. "
348
+ else:
349
+ explanation += f"Good keyword coverage ({bm25_score:.2f}) of job requirements. "
350
+
351
+ if skills:
352
+ explanation += f"Key matching skills: {', '.join(skills[:3])}."
353
+
354
+ return explanation
355
 
356
+ def create_download_link(df, filename="resume_screening_results.csv"):
357
+ """Create download link for results"""
358
  csv = df.to_csv(index=False)
359
  b64 = base64.b64encode(csv.encode()).decode()
360
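+ # Embedding the CSV as a base64 data URI avoids writing any file server-side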
+ return f'<a href="data:file/csv;base64,{b64}" download="{filename}" class="download-btn">📥 Download Results CSV</a>'
 
361
 
362
+ # Main App Interface
363
+ st.title("🎯 AI-Powered Resume Screener")
364
+ st.markdown("*Find the perfect candidates using advanced AI matching*")
 
 
 
365
  st.markdown("---")
366
 
367
+ # Initialize screener
368
+ if st.session_state.embedding_model is None:
369
+ screener = ResumeScreener()
370
+ st.session_state.embedding_model = screener.embedding_model
371
+ st.session_state.explanation_model = screener.explanation_model
372
+ else:
373
+ screener = ResumeScreener()
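+ # (assumes ResumeScreener.__init__ picks the already-loaded models back up
+ # from st.session_state on reruns; otherwise they would reload every time)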
374
 
375
+ # Job Description Input
376
+ st.header("📝 Step 1: Enter Job Description")
377
  job_description = st.text_area(
378
+ "Enter the complete job description or requirements:",
379
+ height=150,
380
+ placeholder="Paste the job description here, including required skills, experience, and qualifications..."
381
  )
382
 
383
+ # Resume Input Options
384
+ st.header("📄 Step 2: Upload Resumes")
385
+
386
+ input_method = st.radio(
387
+ "Choose input method:",
388
+ ["📁 Upload Files", "🗂️ Load from CSV Dataset", "🔗 Load from Hugging Face Dataset"]
389
  )
390
 
 
391
  resume_texts = []
392
  file_names = []
393
 
394
+ if input_method == "📁 Upload Files":
395
  uploaded_files = st.file_uploader(
396
  "Upload resume files",
397
+ type=["pdf", "docx", "txt"],
398
  accept_multiple_files=True,
399
+ help="Supported formats: PDF, DOCX, TXT"
400
  )
401
 
402
  if uploaded_files:
403
+ with st.spinner(f"🔄 Processing {len(uploaded_files)} files..."):
404
  for file in uploaded_files:
405
  file_type = file.name.split('.')[-1].lower()
406
 
407
+ # Save temporary file
408
  with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
409
  tmp_file.write(file.getvalue())
410
  tmp_path = tmp_file.name
411
 
412
+ # Extract text
413
  text = screener.extract_text_from_file(tmp_path, file_type)
414
+ if text.strip():
415
  resume_texts.append(text)
416
  file_names.append(file.name)
417
 
418
+ # Cleanup
419
  os.unlink(tmp_path)
420
+
421
+ if resume_texts:
422
+ st.success(f"Successfully processed {len(resume_texts)} resumes")
423
+
424
+ elif input_method == "🗂️ Load from CSV Dataset":
425
+ csv_file = st.file_uploader("Upload CSV file with resume data", type=["csv"])
 
 
 
426
 
427
+ if csv_file:
428
+ try:
429
+ df = pd.read_csv(csv_file)
430
+ st.write("**CSV Preview:**")
431
+ st.dataframe(df.head())
 
 
 
432
 
433
+ text_column = st.selectbox(
434
+ "Select column containing resume text:",
435
+ df.columns.tolist()
436
+ )
 
 
 
437
 
438
+ name_column = st.selectbox(
439
+ "Select column for candidate names/IDs (optional):",
440
+ ["Use Index"] + df.columns.tolist()
441
+ )
442
+
443
+ if st.button("🚀 Process CSV Data"):
444
+ with st.spinner("🔄 Processing CSV data..."):
445
+ for idx, row in df.iterrows():
446
+ text = str(row[text_column])
447
+ if text and text.strip() and text.lower() != 'nan':
448
+ resume_texts.append(text)
 
 
 
449
 
450
+ if name_column == "Use Index":
451
+ file_names.append(f"Resume_{idx}")
452
+ else:
453
+ file_names.append(str(row[name_column]))
 
454
 
455
+ if resume_texts:
456
+ st.success(f"✅ Successfully loaded {len(resume_texts)} resumes from CSV")
 
457
 
458
+ except Exception as e:
459
+ st.error(f"❌ Error processing CSV: {str(e)}")
460
+
461
+ elif input_method == "🔗 Load from Hugging Face Dataset":
462
+ st.markdown("**Quick Load:** [Resume Atlas Dataset](https://huggingface.co/datasets/ahmedheakl/resume-atlas)")
463
+
464
+ col1, col2 = st.columns([2, 1])
465
+ with col1:
466
+ dataset_name = st.text_input(
467
+ "Dataset name:",
468
+ value="ahmedheakl/resume-atlas",
469
+ help="Enter Hugging Face dataset name"
 
 
470
  )
471
+ with col2:
472
+ dataset_split = st.selectbox("Split:", ["train", "test", "validation"], index=0)
473
+
474
+ if st.button("🔗 Load from Hugging Face"):
475
+ try:
476
+ with st.spinner(f"🔄 Loading {dataset_name}..."):
477
+ dataset = load_dataset(dataset_name, split=dataset_split)
478
 
479
+ st.success(f"✅ Loaded dataset with {len(dataset)} entries")
480
+ st.write("**Dataset Preview:**")
481
+
482
+ # Show first few examples
483
+ preview_df = pd.DataFrame(dataset[:5])
484
+ st.dataframe(preview_df)
485
+
486
+ # Column selection
487
+ text_column = st.selectbox(
488
+ "Select column with resume text:",
489
+ dataset.column_names,
490
+ index=dataset.column_names.index('resume_text') if 'resume_text' in dataset.column_names else 0
491
+ )
492
+
493
+ category_column = None
494
+ if 'category' in dataset.column_names:
495
+ category_column = st.selectbox(
496
+ "Filter by category (optional):",
497
+ ["All"] + list(set(dataset['category']))
498
  )
499
+
500
+ max_samples = st.slider("Maximum samples to load:", 10, min(1000, len(dataset)), 100)
501
+
502
+ if st.button("🚀 Process Dataset"):
503
+ with st.spinner("🔄 Processing dataset..."):
504
+ filtered_dataset = dataset
 
 
 
505
 
506
+ # Apply category filter
507
+ if category_column and category_column != "All":
508
+ filtered_dataset = dataset.filter(lambda x: x['category'] == category_column)
509
 
510
+ # Limit samples
511
+ sample_indices = list(range(min(max_samples, len(filtered_dataset))))
512
 
513
+ for idx in sample_indices:
514
+ item = filtered_dataset[idx]
515
+ text = str(item[text_column])
 
 
 
516
 
517
+ if text and text.strip() and text.lower() != 'nan':
518
+ resume_texts.append(text)
 
 
 
519
 
520
+ # Use ID or index for naming
521
+ if 'id' in item:
522
+ file_names.append(f"Resume_{item['id']}")
523
+ else:
524
+ file_names.append(f"Resume_{idx}")
525
+
526
+ if resume_texts:
527
+ st.success(f"✅ Successfully loaded {len(resume_texts)} resumes")
528
+
529
+ except Exception as e:
530
+ st.error(f"❌ Error loading dataset: {str(e)}")
531
 
532
+ # Processing and Results
533
+ if st.button("🔍 Find Best Candidates", disabled=not (job_description and resume_texts)):
534
+ if len(resume_texts) == 0:
535
+ st.error("❌ Please upload resumes first!")
536
+ elif not job_description.strip():
537
+ st.error("❌ Please enter a job description!")
538
+ else:
539
+ with st.spinner("🧠 AI is analyzing resumes..."):
540
+ # Calculate scores
541
+ hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
542
+ resume_texts, job_description
543
+ )
 
 
544
 
545
+ # Prepare results
546
+ results = []
547
+ for i, (name, text, hybrid_score, semantic_score, bm25_score) in enumerate(
548
+ zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores)
549
+ ):
550
+ # Extract skills
551
+ skills = screener.extract_skills(text, job_description)
552
+
553
+ # Generate explanation
554
+ explanation = ""
555
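+ # use_explanation is assumed to be a sidebar toggle defined earlier in app.py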
+ if use_explanation:
556
+ explanation = screener.generate_explanation(
557
+ text, job_description, hybrid_score, semantic_score, bm25_score, skills
558
+ )
 
 
 
559
 
560
+ results.append({
561
+ 'rank': i + 1,
562
+ 'name': name,
563
+ 'score': hybrid_score,
564
+ 'semantic_score': semantic_score,
565
+ 'keyword_score': bm25_score,
566
+ 'skills': skills,
567
+ 'explanation': explanation,
568
+ 'text_preview': text[:300] + "..." if len(text) > 300 else text
569
+ })
570
 
571
+ # Sort by score
572
+ results.sort(key=lambda x: x['score'], reverse=True)
 
573
 
574
+ # Update ranks
575
+ for i, result in enumerate(results):
576
+ result['rank'] = i + 1
 
 
 
577
 
578
+ # Store in session state
579
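+ # top_k is assumed to be a sidebar control defined earlier in app.py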
+ st.session_state.results = results[:top_k]
 
 
580
 
581
+ st.success(f"🎉 Analysis complete! Found top {len(st.session_state.results)} candidates")
 
 
 
582
 
583
+ # Display Results
584
  if st.session_state.results:
585
+ st.header("🏆 Top Candidates")
586
 
587
+ # Create summary dataframe
588
+ summary_data = []
589
  for result in st.session_state.results:
590
+ summary_data.append({
591
+ "Rank": result['rank'],
592
+ "Candidate": result['name'],
593
+ "Overall Score": f"{result['score']:.3f}",
594
+ "Semantic Score": f"{result['semantic_score']:.3f}",
595
+ "Keyword Score": f"{result['keyword_score']:.3f}",
596
+ "Key Skills": ", ".join(result['skills'][:3]) + ("..." if len(result['skills']) > 3 else ""),
597
  })
598
 
599
+ summary_df = pd.DataFrame(summary_data)
600
+ st.dataframe(summary_df, use_container_width=True)
601
+
602
+ # Download link
603
+ detailed_data = []
604
+ for result in st.session_state.results:
605
+ detailed_data.append({
606
+ "Rank": result['rank'],
607
+ "Candidate": result['name'],
608
+ "Overall_Score": result['score'],
609
+ "Semantic_Score": result['semantic_score'],
610
+ "Keyword_Score": result['keyword_score'],
611
+ "Skills": "; ".join(result['skills']),
612
+ "Explanation": result['explanation'],
613
+ "Resume_Preview": result['text_preview']
614
+ })
615
 
616
+ download_df = pd.DataFrame(detailed_data)
617
+ st.markdown(create_download_link(download_df), unsafe_allow_html=True)
618
 
619
+ # Detailed results
620
+ st.subheader("📋 Detailed Analysis")
621
+
622
+ for result in st.session_state.results:
623
+ with st.expander(f"🥇 #{result['rank']}: {result['name']} (Score: {result['score']:.3f})"):
624
+ col1, col2 = st.columns([1, 2])
625
 
626
  with col1:
627
+ st.metric("Overall Score", f"{result['score']:.3f}")
628
+ st.metric("Semantic Match", f"{result['semantic_score']:.3f}")
629
+ st.metric("Keyword Match", f"{result['keyword_score']:.3f}")
 
630
 
631
+ st.write("**🎯 Key Skills:**")
632
+ for skill in result['skills'][:8]:
633
+ st.write(f"• {skill}")
 
 
 
634
 
635
  with col2:
636
+ if result['explanation']:
637
+ st.write("**🤖 AI Analysis:**")
638
+ st.info(result['explanation'])
639
+
640
+ st.write("**📄 Resume Preview:**")
641
+ st.text_area("", result['text_preview'], height=150, disabled=True, key=f"preview_{result['rank']}")
642
+
643
+ # Score visualization
644
+ if len(st.session_state.results) > 1:
645
+ st.subheader("📊 Score Visualization")
646
+
647
+ chart_data = pd.DataFrame({
648
+ 'Candidate': [r['name'] for r in st.session_state.results],
649
+ 'Overall Score': [r['score'] for r in st.session_state.results],
650
+ 'Semantic Score': [r['semantic_score'] for r in st.session_state.results],
651
+ 'Keyword Score': [r['keyword_score'] for r in st.session_state.results]
652
+ })
653
+
654
+ st.bar_chart(chart_data.set_index('Candidate'))
655
+
656
+ # Memory cleanup
657
+ if st.button("🧹 Clear Memory"):
658
+ if torch.cuda.is_available():
659
+ torch.cuda.empty_cache()
660
+ gc.collect()
661
+ st.success("✅ Memory cleared!")
662
 
663
  # Footer
664
  st.markdown("---")
665
+ st.markdown(
666
+ """
667
+ <div style='text-align: center; color: #666;'>
668
+ 🚀 Powered by NVIDIA NV-Embed-v2 & Qwen3-14B | Built with Streamlit
669
+ </div>
670
+ """,
671
+ unsafe_allow_html=True
672
+ )
explanation_generator.py DELETED
@@ -1,223 +0,0 @@
1
- """
2
- Explanation Generator Module
3
-
4
- This module handles the generation of explanations for resume rankings
5
- using the Qwen3-14B model from Hugging Face.
6
- """
7
-
8
- import torch
9
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
10
- import os
11
- import re
12
- import sys
13
-
14
- # Use the alternative model loading approach
15
- try:
16
- # Try to import the functions from alt_models.py
17
- from alt_models import load_explanation_model
18
- USE_ALT_MODELS = True
19
- except ImportError:
20
- USE_ALT_MODELS = False
21
- # If import fails, we'll use the original approach
22
- # Add Replicate class workaround if not already defined
23
- try:
24
- from transformers.models.qwen2.modeling_qwen2 import Replicate
25
- except (ImportError, AttributeError):
26
- class Replicate(torch.nn.Module):
27
- """Workaround class for missing Replicate in Qwen models"""
28
- def __init__(self, module, num_replicas=1):
29
- super().__init__()
30
- self.module = module
31
- self.num_replicas = num_replicas
32
-
33
- def forward(self, *args, **kwargs):
34
- return self.module(*args, **kwargs)
35
-
36
- # Create module structure if it doesn't exist yet
37
- parent_modules = [
38
- "transformers.models",
39
- "transformers.models.qwen2",
40
- ]
41
-
42
- # Create all parent modules
43
- for module_path in parent_modules:
44
- if module_path not in sys.modules:
45
- sys.modules[module_path] = type('', (), {})
46
-
47
- # Create and add the Replicate class
48
- if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
49
- sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
50
- sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
51
-
52
- # Load Qwen3 model at initialization time
53
- print("Loading Qwen/Qwen3-14B model with 4-bit quantization...")
54
- QWEN_MODEL_NAME = "Qwen/Qwen3-14B"
55
-
56
- if USE_ALT_MODELS:
57
- # Use the alternative loading approach
58
- global_qwen_model, global_qwen_tokenizer = load_explanation_model(QWEN_MODEL_NAME)
59
- else:
60
- # Use original approach
61
- try:
62
- # Configure 4-bit quantization for better performance
63
- quantization_config = BitsAndBytesConfig(
64
- load_in_4bit=True,
65
- bnb_4bit_quant_type="nf4",
66
- bnb_4bit_compute_dtype=torch.float16,
67
- bnb_4bit_use_double_quant=True
68
- )
69
-
70
- # Load Qwen3 model and tokenizer
71
- global_qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME, trust_remote_code=True)
72
- global_qwen_model = None
73
-
74
- # Check if we have enough resources to load the model
75
- if torch.cuda.is_available():
76
- gpu_memory = torch.cuda.get_device_properties(0).total_memory
77
- if gpu_memory >= 12 * (1024**3): # 12 GB (reduced memory requirement compared to 32B model)
78
- global_qwen_model = AutoModelForCausalLM.from_pretrained(
79
- QWEN_MODEL_NAME,
80
- quantization_config=quantization_config,
81
- device_map="auto",
82
- trust_remote_code=True,
83
- torch_dtype=torch.float16
84
- )
85
- print("Successfully loaded Qwen3-14B with 4-bit quantization")
86
- else:
87
- print("Not enough GPU memory, using template-based explanations")
88
- else:
89
- print("CUDA not available, using template-based explanations")
90
-
91
- except Exception as e:
92
- print(f"Error loading Qwen3-14B model: {str(e)}")
93
- print("Falling back to template-based explanations.")
94
- global_qwen_tokenizer = None
95
- global_qwen_model = None
96
-
97
- class ExplanationGenerator:
98
- def __init__(self, model_name="Qwen/Qwen3-14B"):
99
- """Initialize the explanation generator with the specified model"""
100
- self.model_name = model_name
101
- # Use globally pre-loaded model and tokenizer
102
- self.model = global_qwen_model
103
- self.tokenizer = global_qwen_tokenizer
104
- self.initialized = True
105
-
106
- def generate_explanation(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
107
- """Generate explanation for why a resume was ranked highly"""
108
- # Use the model if it's available
109
- if self.model is not None and self.tokenizer is not None:
110
- try:
111
- # Prepare prompt for Qwen3-14B
112
- prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)
113
-
114
- # Create messages for chat format
115
- messages = [
116
- {"role": "user", "content": prompt}
117
- ]
118
-
119
- # Apply chat template with thinking mode enabled
120
- text = self.tokenizer.apply_chat_template(
121
- messages,
122
- tokenize=False,
123
- add_generation_prompt=True,
124
- enable_thinking=True
125
- )
126
-
127
- # Tokenize
128
- inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
129
-
130
- # Generate response with recommended parameters for thinking mode
131
- output_ids = self.model.generate(
132
- **inputs,
133
- max_new_tokens=500,
134
- temperature=0.6,
135
- top_p=0.95,
136
- top_k=20
137
- )
138
-
139
- # Decode the response
140
- response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
141
-
142
- # Clean up the response
143
- cleaned_response = self._clean_response(response)
144
-
145
- return cleaned_response
146
-
147
- except Exception as e:
148
- print(f"Error generating explanation with Qwen3-14B: {str(e)}")
149
- # Fall back to template-based explanation
150
- return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
151
- else:
152
- # Use template-based explanation if model is not available
153
- return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
154
-
155
- def _create_prompt(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
156
- """Create a prompt for the explanation generation"""
157
- # Use only the first 1000 characters of the resume to keep prompt size manageable
158
- resume_excerpt = resume_text[:1000] + "..." if len(resume_text) > 1000 else resume_text
159
-
160
- prompt = f"""You are an AI assistant helping a recruiter understand why a candidate's resume was matched with a job posting.
161
-
162
- The resume has been assigned the following scores:
163
- - Overall Match Score: {score:.2f} out of 1.0
164
- - Semantic Relevance Score: {semantic_score:.2f} out of 1.0
165
- - Keyword Match Score: {keyword_score:.2f} out of 1.0
166
-
167
- The job description is:
168
- ```
169
- {job_description}
170
- ```
171
-
172
- Based on analysis, the resume contains these skills relevant to the job: {', '.join(skills)}
173
-
174
- Resume excerpt:
175
- ```
176
- {resume_excerpt}
177
- ```
178
-
179
- Please provide a short explanation (3-5 sentences) of why this resume received these scores and how well it matches the job requirements. Focus on the relationship between the candidate's experience and the job requirements."""
180
-
181
- return prompt
182
-
183
- def _clean_response(self, response):
184
- """Clean the response from the model"""
185
- # Remove any thinking or internal processing tokens
186
- response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
187
-
188
- # Limit to a reasonable length
189
- if len(response) > 500:
190
- sentences = response.split('.')
191
- shortened = '.'.join(sentences[:5]) + '.'
192
- return shortened
193
-
194
- return response
195
-
196
- def _generate_template_explanation(self, score, semantic_score, keyword_score, skills):
197
- """Generate a template-based explanation when the model is not available"""
198
- # Simple template-based explanation
199
- if score > 0.8:
200
- quality = "excellent"
201
- elif score > 0.6:
202
- quality = "good"
203
- elif score > 0.4:
204
- quality = "moderate"
205
- else:
206
- quality = "limited"
207
-
208
- explanation = f"This resume shows {quality} alignment with the job requirements, with an overall score of {score:.2f}. "
209
-
210
- if semantic_score > keyword_score:
211
- explanation += f"The candidate's experience demonstrates strong semantic relevance ({semantic_score:.2f}) to the position, though specific keyword matches ({keyword_score:.2f}) could be improved. "
212
- else:
213
- explanation += f"The resume contains many relevant keywords ({keyword_score:.2f}), but could benefit from better contextual alignment ({semantic_score:.2f}) with the job requirements. "
214
-
215
- if skills:
216
- if len(skills) > 3:
217
- explanation += f"Key skills identified include {', '.join(skills[:3])}, and {len(skills)-3} others that match the job requirements."
218
- else:
219
- explanation += f"Key skills identified include {', '.join(skills)}."
220
- else:
221
- explanation += "No specific skills were identified that directly match the requirements."
222
-
223
- return explanation
 
 
fix_dependencies.py DELETED
@@ -1,76 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Dependency fixer for Resume Screener and Skill Extractor
4
- This script ensures all dependencies are properly installed with compatible versions.
5
- """
6
-
7
- import sys
8
- import subprocess
9
- import pkg_resources
10
- import os
11
-
12
- def install(package):
13
- """Install a package using pip"""
14
- subprocess.check_call([sys.executable, "-m", "pip", "install", package])
15
-
16
- def install_with_message(package, message=None):
17
- """Install a package with an optional message"""
18
- if message:
19
- print(f"\n{message}")
20
- print(f"Installing {package}...")
21
- install(package)
22
-
23
- def main():
24
- print("Running dependency fixer for Resume Screener and Skill Extractor...")
25
-
26
- # Install core dependencies first
27
- install_with_message("pip==23.1.2", "Upgrading pip to ensure compatibility")
28
- install_with_message("setuptools==68.0.0", "Installing compatible setuptools")
29
-
30
- # Check if we're in a Hugging Face Space
31
- in_hf_space = os.environ.get("SPACE_ID") is not None
32
-
33
- # Install key libraries with specific versions to ensure compatibility
34
- dependencies = [
35
- ("streamlit==1.31.0", "Installing Streamlit for the web interface"),
36
- ("pdfplumber==0.10.1", "Installing PDF processing libraries"),
37
- ("PyPDF2==3.0.1", None),
38
- ("python-docx==1.0.1", None),
39
- ("rank-bm25==0.2.2", "Installing BM25 ranking library"),
40
- ("tqdm==4.66.1", "Installing progress bar utility"),
41
- ("faiss-cpu==1.7.4", "Installing FAISS for vector similarity search"),
42
- ("huggingface-hub==0.20.3", "Installing Hugging Face Hub"),
43
- ("transformers==4.36.2", "Installing Transformers"),
44
- ("sentence-transformers==2.2.2", "Installing Sentence Transformers"),
45
- ("torch==2.1.2", "Installing PyTorch"),
46
- ("nltk==3.8.1", "Installing NLTK for text processing"),
47
- ("pandas==2.1.3", "Installing data processing libraries"),
48
- ("numpy==1.24.3", None),
49
- ("plotly==5.18.0", "Installing visualization libraries"),
50
- ("spacy==3.7.2", "Installing spaCy for NLP"),
51
- ]
52
-
53
- # Install all dependencies
54
- for package, message in dependencies:
55
- install_with_message(package, message)
56
-
57
- # Download required NLTK data
58
- print("\nDownloading NLTK data...")
59
- install("nltk")
60
- import nltk
61
- nltk.download('punkt')
62
-
63
- # Download spaCy model if not in a Hugging Face Space
64
- # (Spaces should include this in the requirements.txt)
65
- if not in_hf_space:
66
- print("\nDownloading spaCy model...")
67
- try:
68
- subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
69
- except:
70
- install("https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz")
71
-
72
- print("\nDependency installation complete!")
73
- print("You can now run the Resume Screener with: streamlit run app.py")
74
-
75
- if __name__ == "__main__":
76
- main()
 
 
requirements.txt CHANGED
@@ -1,22 +1,18 @@
1
  streamlit==1.31.0
 
 
2
  pdfplumber==0.10.1
3
  PyPDF2==3.0.1
4
  python-docx==1.0.1
5
- spacy==3.7.2
6
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz
7
- transformers==4.48.0
8
- torch==2.1.2
9
  nltk==3.8.1
10
  faiss-cpu==1.7.4
11
  rank-bm25==0.2.2
12
- sentence-transformers==2.7.0
13
- plotly==5.18.0
14
  pandas==2.1.3
15
  numpy==1.24.3
16
  tqdm==4.66.1
17
  huggingface-hub==0.27.1
18
- einops
19
- bitsandbytes>=0.41.0
20
- accelerate>=0.23.0
21
- optimum>=1.13.1
22
- safetensors>=0.3.1
 
 
1
  streamlit==1.31.0
2
+ transformers==4.48.0
3
+ torch==2.1.2
4
  pdfplumber==0.10.1
5
  PyPDF2==3.0.1
6
  python-docx==1.0.1
 
 
7
  nltk==3.8.1
8
  faiss-cpu==1.7.4
9
  rank-bm25==0.2.2
 
 
10
  pandas==2.1.3
11
  numpy==1.24.3
12
  tqdm==4.66.1
13
  huggingface-hub==0.27.1
14
+ bitsandbytes==0.44.1
15
+ accelerate==0.27.2
16
+ datasets==2.18.0
17
+ sentence-transformers==2.7.0
18
+ plotly==5.18.0