root committed on
Commit 6ee771e · 1 Parent(s): 5f638fd
Files changed (1)
  1. app.py +86 -12
app.py CHANGED
@@ -189,7 +189,24 @@ class ResumeScreener:
             with torch.no_grad():
                 outputs = self.model(**inputs)

-            # Use [CLS] token embedding or mean pooling based on model architecture
+            # Handle specific case for NV-Embed-v2 which returns a nested structure
+            if self.embedding_model_name == "nvidia/NV-Embed-v2":
+                # Access the embedding from the NV-Embed specific output format
+                if hasattr(outputs, "pooler_output"):
+                    embeddings = outputs.pooler_output
+                    embedding_np = embeddings.cpu().detach().numpy()
+                    if self.embedding_size is None:
+                        self.embedding_size = embedding_np.shape[1]
+                    return embedding_np[0]  # Return the first embedding
+                # Try to handle multi-level dictionary if the model changed output format
+                elif isinstance(outputs, dict) and "embedding" in outputs:
+                    embeddings = outputs["embedding"]
+                    embedding_np = embeddings.cpu().detach().numpy()
+                    if self.embedding_size is None:
+                        self.embedding_size = embedding_np.shape[1]
+                    return embedding_np[0]
+
+            # Handle different output structures
             if hasattr(outputs, "last_hidden_state"):
                 # Mean pooling across token dimension
                 embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
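Note: the mean-pooling branch kept above follows the usual pattern for Hugging Face encoders that expose last_hidden_state. A minimal standalone sketch of that path, assuming a generic AutoModel encoder (the model name below is only an illustrative placeholder, not necessarily the one app.py loads):

import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Illustrative model choice; the app selects its embedding model elsewhere.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()

def embed_text(text: str) -> np.ndarray:
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling across the token dimension, as in the last_hidden_state branch above.
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()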
@@ -200,18 +217,48 @@ class ResumeScreener:
                     self.embedding_size = embedding_np.shape[0]

                 return embedding_np
-            else:
-                # For models that return a specific embedding
+            elif isinstance(outputs, dict) and "embeddings" in outputs:
+                # For models that return a dictionary with embeddings
+                embeddings = outputs["embeddings"]
+                embedding_np = embeddings.cpu().detach().numpy()
+
+                # Set embedding size if not set
+                if self.embedding_size is None:
+                    self.embedding_size = embedding_np.shape[1]  # Use correct dimension
+
+                return embedding_np[0]  # Return the first embedding
+            elif isinstance(outputs, torch.Tensor):
+                # For models that return a tensor directly
                 embedding_np = outputs.cpu().detach().numpy()

                 # Set embedding size if not set
                 if self.embedding_size is None:
-                    self.embedding_size = embedding_np.shape[0]
+                    self.embedding_size = embedding_np.shape[-1]

-                return embedding_np
+                return embedding_np.squeeze()
+            else:
+                # If we can't determine the output structure, try to inspect it for debugging
+                st.warning(f"Unexpected output structure from model: {type(outputs)}")
+                if hasattr(outputs, "__dict__"):
+                    for attr_name in dir(outputs):
+                        if not attr_name.startswith('_'):
+                            attr = getattr(outputs, attr_name)
+                            if isinstance(attr, torch.Tensor):
+                                st.info(f"Found tensor attribute '{attr_name}' with shape {attr.shape}")
+                                embedding_np = attr.cpu().detach().numpy()
+                                if self.embedding_size is None:
+                                    self.embedding_size = embedding_np.shape[-1]
+                                return embedding_np.squeeze()
+
+                # Last resort: return zeros
+                if self.embedding_size is None:
+                    self.embedding_size = 768  # Default size
+                return np.zeros(self.embedding_size)
         except Exception as e:
             st.error(f"Error generating embedding: {str(e)}")
-            return np.zeros(768)  # Default embedding size as fallback
+            if self.embedding_size is None:
+                self.embedding_size = 768  # Default size
+            return np.zeros(self.embedding_size)

     def create_faiss_index(self, embeddings):
         """Create a FAISS index for fast similarity search"""
@@ -249,13 +296,40 @@ class ResumeScreener:
         # Prepare corpus from resumes
         corpus = [word_tokenize(resume.lower()) for resume in resume_texts]

-        # Initialize BM25
-        bm25 = BM25Okapi(corpus)
-
-        # Calculate scores
-        scores = bm25.get_scores(job_tokens)
+        # Check if corpus is empty
+        if not corpus or len(corpus) == 0:
+            st.error("No resume texts provided for BM25 calculation")
+            return [0.0] * len(resume_texts)
+
+        # Check for empty documents in corpus
+        filtered_corpus = [doc for doc in corpus if len(doc) > 0]
+        if not filtered_corpus:
+            st.error("All resume texts are empty after tokenization")
+            return [0.0] * len(resume_texts)

-        return scores
+        # Initialize BM25
+        try:
+            bm25 = BM25Okapi(filtered_corpus)
+
+            # Calculate scores
+            scores = bm25.get_scores(job_tokens)
+
+            # If we filtered out empty documents, we need to reconstruct the scores array
+            if len(filtered_corpus) != len(corpus):
+                full_scores = []
+                filtered_idx = 0
+                for i in range(len(corpus)):
+                    if len(corpus[i]) > 0:
+                        full_scores.append(scores[filtered_idx])
+                        filtered_idx += 1
+                    else:
+                        full_scores.append(0.0)
+                return full_scores
+            else:
+                return scores
+        except Exception as e:
+            st.error(f"Error in BM25 calculation: {str(e)}")
+            return [0.0] * len(resume_texts)

     def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding, semantic_weight=0.7, use_faiss=True):
         """Calculate hybrid scores combining semantic similarity and BM25"""
 