Commit 6ee771e committed by root
Parent(s): 5f638fd
Commit message: ss

app.py CHANGED
@@ -189,7 +189,24 @@ class ResumeScreener:
             with torch.no_grad():
                 outputs = self.model(**inputs)

-            #
+            # Handle specific case for NV-Embed-v2 which returns a nested structure
+            if self.embedding_model_name == "nvidia/NV-Embed-v2":
+                # Access the embedding from the NV-Embed specific output format
+                if hasattr(outputs, "pooler_output"):
+                    embeddings = outputs.pooler_output
+                    embedding_np = embeddings.cpu().detach().numpy()
+                    if self.embedding_size is None:
+                        self.embedding_size = embedding_np.shape[1]
+                    return embedding_np[0]  # Return the first embedding
+                # Try to handle multi-level dictionary if the model changed output format
+                elif isinstance(outputs, dict) and "embedding" in outputs:
+                    embeddings = outputs["embedding"]
+                    embedding_np = embeddings.cpu().detach().numpy()
+                    if self.embedding_size is None:
+                        self.embedding_size = embedding_np.shape[1]
+                    return embedding_np[0]
+
+            # Handle different output structures
             if hasattr(outputs, "last_hidden_state"):
                 # Mean pooling across token dimension
                 embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
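Note (not part of the commit): the `last_hidden_state.mean(dim=1)` pooling above averages over padding tokens as well, which skews embeddings for short inputs in a padded batch. The common alternative is attention-mask-weighted mean pooling; a minimal sketch, assuming a standard Hugging Face model output and tokenizer attention mask:

```python
import torch

def masked_mean_pool(last_hidden_state: torch.Tensor,
                     attention_mask: torch.Tensor) -> torch.Tensor:
    """Mean-pool token embeddings while ignoring padding positions."""
    # (batch, seq) -> (batch, seq, 1), cast to the hidden states' dtype
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    summed = (last_hidden_state * mask).sum(dim=1)   # (batch, hidden)
    counts = mask.sum(dim=1).clamp(min=1e-9)         # avoid division by zero
    return summed / counts
```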
@@ -200,18 +217,48 @@ class ResumeScreener:
                     self.embedding_size = embedding_np.shape[0]

                 return embedding_np
-
-            # For models that return a
+            elif isinstance(outputs, dict) and "embeddings" in outputs:
+                # For models that return a dictionary with embeddings
+                embeddings = outputs["embeddings"]
+                embedding_np = embeddings.cpu().detach().numpy()
+
+                # Set embedding size if not set
+                if self.embedding_size is None:
+                    self.embedding_size = embedding_np.shape[1]  # Use correct dimension
+
+                return embedding_np[0]  # Return the first embedding
+            elif isinstance(outputs, torch.Tensor):
+                # For models that return a tensor directly
                 embedding_np = outputs.cpu().detach().numpy()

                 # Set embedding size if not set
                 if self.embedding_size is None:
-                    self.embedding_size = embedding_np.shape[
+                    self.embedding_size = embedding_np.shape[-1]

-                return embedding_np
+                return embedding_np.squeeze()
+            else:
+                # If we can't determine the output structure, try to inspect it for debugging
+                st.warning(f"Unexpected output structure from model: {type(outputs)}")
+                if hasattr(outputs, "__dict__"):
+                    for attr_name in dir(outputs):
+                        if not attr_name.startswith('_'):
+                            attr = getattr(outputs, attr_name)
+                            if isinstance(attr, torch.Tensor):
+                                st.info(f"Found tensor attribute '{attr_name}' with shape {attr.shape}")
+                                embedding_np = attr.cpu().detach().numpy()
+                                if self.embedding_size is None:
+                                    self.embedding_size = embedding_np.shape[-1]
+                                return embedding_np.squeeze()
+
+            # Last resort: return zeros
+            if self.embedding_size is None:
+                self.embedding_size = 768  # Default size
+            return np.zeros(self.embedding_size)
         except Exception as e:
             st.error(f"Error generating embedding: {str(e)}")
-
+            if self.embedding_size is None:
+                self.embedding_size = 768  # Default size
+            return np.zeros(self.embedding_size)

     def create_faiss_index(self, embeddings):
         """Create a FAISS index for fast similarity search"""
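For reference, the dispatch order this hunk builds up can be hard to read inline. Stripped of the Streamlit calls and the NV-Embed special case, it reduces to something like the following illustrative mirror (not the committed code):

```python
import numpy as np
import torch

def extract_embedding(outputs):
    """Illustrative mirror of the output-structure dispatch above."""
    if hasattr(outputs, "last_hidden_state"):
        return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    if isinstance(outputs, dict) and "embeddings" in outputs:
        return outputs["embeddings"].cpu().detach().numpy()[0]
    if isinstance(outputs, torch.Tensor):
        return outputs.cpu().detach().numpy().squeeze()
    return np.zeros(768)  # same last-resort default as the diff

# Dict-style and tensor-style outputs take different branches:
print(extract_embedding({"embeddings": torch.randn(1, 384)}).shape)  # (384,)
print(extract_embedding(torch.randn(1, 768)).shape)                  # (768,)
```

One design consequence worth noting: the zero-vector fallback keeps the app from crashing, but zero embeddings will silently score near zero in any downstream similarity ranking.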
@@ -249,13 +296,40 @@ class ResumeScreener:
         # Prepare corpus from resumes
         corpus = [word_tokenize(resume.lower()) for resume in resume_texts]

-        #
-
-
-
-
+        # Check if corpus is empty
+        if not corpus or len(corpus) == 0:
+            st.error("No resume texts provided for BM25 calculation")
+            return [0.0] * len(resume_texts)
+
+        # Check for empty documents in corpus
+        filtered_corpus = [doc for doc in corpus if len(doc) > 0]
+        if not filtered_corpus:
+            st.error("All resume texts are empty after tokenization")
+            return [0.0] * len(resume_texts)

-
+        # Initialize BM25
+        try:
+            bm25 = BM25Okapi(filtered_corpus)
+
+            # Calculate scores
+            scores = bm25.get_scores(job_tokens)
+
+            # If we filtered out empty documents, we need to reconstruct the scores array
+            if len(filtered_corpus) != len(corpus):
+                full_scores = []
+                filtered_idx = 0
+                for i in range(len(corpus)):
+                    if len(corpus[i]) > 0:
+                        full_scores.append(scores[filtered_idx])
+                        filtered_idx += 1
+                    else:
+                        full_scores.append(0.0)
+                return full_scores
+            else:
+                return scores
+        except Exception as e:
+            st.error(f"Error in BM25 calculation: {str(e)}")
+            return [0.0] * len(resume_texts)

     def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding, semantic_weight=0.7, use_faiss=True):
         """Calculate hybrid scores combining semantic similarity and BM25"""
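The new guards matter because `rank_bm25.BM25Okapi` derives an average document length from the corpus at construction time: an empty corpus, or one where every document tokenizes to nothing, can raise a `ZeroDivisionError` instead of returning scores. A minimal standalone example of the API used above:

```python
from rank_bm25 import BM25Okapi

tokenized_corpus = [
    ["python", "machine", "learning", "engineer"],
    ["java", "backend", "developer"],
]
tokenized_query = ["python", "engineer"]

bm25 = BM25Okapi(tokenized_corpus)         # precomputes idf and avg doc length
scores = bm25.get_scores(tokenized_query)  # one score per corpus document
print(scores)  # the first document matches both query terms, so it scores higher
```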
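The diff ends at the `calculate_hybrid_scores` signature; its body is outside this commit. For orientation only, the usual shape of such a blend is to min-max-normalize the BM25 scores and mix them with the semantic similarities at `semantic_weight`. A sketch under that assumption (`hybrid_scores` and its arguments are hypothetical names, not the app's API):

```python
import numpy as np

def hybrid_scores(semantic_sims, bm25_scores, semantic_weight=0.7):
    """Blend cosine similarities with min-max-normalized BM25 scores (sketch)."""
    sem = np.asarray(semantic_sims, dtype=float)
    bm = np.asarray(bm25_scores, dtype=float)
    span = bm.max() - bm.min()
    # All-equal BM25 scores (e.g. the all-zeros fallback) normalize to zero
    bm_norm = (bm - bm.min()) / span if span > 0 else np.zeros_like(bm)
    return semantic_weight * sem + (1.0 - semantic_weight) * bm_norm
```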