root committed on
Commit 6ee771e · 1 Parent(s): 5f638fd
Files changed (1)
  1. app.py +86 -12
app.py CHANGED
@@ -189,7 +189,24 @@ class ResumeScreener:
             with torch.no_grad():
                 outputs = self.model(**inputs)

-            # Use [CLS] token embedding or mean pooling based on model architecture
+            # Handle specific case for NV-Embed-v2 which returns a nested structure
+            if self.embedding_model_name == "nvidia/NV-Embed-v2":
+                # Access the embedding from the NV-Embed specific output format
+                if hasattr(outputs, "pooler_output"):
+                    embeddings = outputs.pooler_output
+                    embedding_np = embeddings.cpu().detach().numpy()
+                    if self.embedding_size is None:
+                        self.embedding_size = embedding_np.shape[1]
+                    return embedding_np[0]  # Return the first embedding
+                # Try to handle multi-level dictionary if the model changed output format
+                elif isinstance(outputs, dict) and "embedding" in outputs:
+                    embeddings = outputs["embedding"]
+                    embedding_np = embeddings.cpu().detach().numpy()
+                    if self.embedding_size is None:
+                        self.embedding_size = embedding_np.shape[1]
+                    return embedding_np[0]
+
+            # Handle different output structures
             if hasattr(outputs, "last_hidden_state"):
                 # Mean pooling across token dimension
                 embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
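Note: the mean-pooling branch kept above follows the usual pattern for Hugging Face encoders that expose last_hidden_state. A minimal standalone sketch of that path, assuming a generic AutoModel encoder (the model name below is only an illustrative placeholder, not necessarily the one app.py loads):

import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Illustrative model choice; the app selects its embedding model elsewhere.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()

def embed_text(text: str) -> np.ndarray:
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling across the token dimension, as in the last_hidden_state branch above.
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()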
@@ -200,18 +217,48 @@ class ResumeScreener:
                     self.embedding_size = embedding_np.shape[0]

                 return embedding_np
-            else:
-                # For models that return a specific embedding
+            elif isinstance(outputs, dict) and "embeddings" in outputs:
+                # For models that return a dictionary with embeddings
+                embeddings = outputs["embeddings"]
+                embedding_np = embeddings.cpu().detach().numpy()
+
+                # Set embedding size if not set
+                if self.embedding_size is None:
+                    self.embedding_size = embedding_np.shape[1]  # Use correct dimension
+
+                return embedding_np[0]  # Return the first embedding
+            elif isinstance(outputs, torch.Tensor):
+                # For models that return a tensor directly
                 embedding_np = outputs.cpu().detach().numpy()

                 # Set embedding size if not set
                 if self.embedding_size is None:
-                    self.embedding_size = embedding_np.shape[0]
+                    self.embedding_size = embedding_np.shape[-1]

-                return embedding_np
+                return embedding_np.squeeze()
+            else:
+                # If we can't determine the output structure, try to inspect it for debugging
+                st.warning(f"Unexpected output structure from model: {type(outputs)}")
+                if hasattr(outputs, "__dict__"):
+                    for attr_name in dir(outputs):
+                        if not attr_name.startswith('_'):
+                            attr = getattr(outputs, attr_name)
+                            if isinstance(attr, torch.Tensor):
+                                st.info(f"Found tensor attribute '{attr_name}' with shape {attr.shape}")
+                                embedding_np = attr.cpu().detach().numpy()
+                                if self.embedding_size is None:
+                                    self.embedding_size = embedding_np.shape[-1]
+                                return embedding_np.squeeze()
+
+                # Last resort: return zeros
+                if self.embedding_size is None:
+                    self.embedding_size = 768  # Default size
+                return np.zeros(self.embedding_size)
         except Exception as e:
             st.error(f"Error generating embedding: {str(e)}")
-            return np.zeros(768)  # Default embedding size as fallback
+            if self.embedding_size is None:
+                self.embedding_size = 768  # Default size
+            return np.zeros(self.embedding_size)

     def create_faiss_index(self, embeddings):
         """Create a FAISS index for fast similarity search"""
@@ -249,13 +296,40 @@ class ResumeScreener:
         # Prepare corpus from resumes
         corpus = [word_tokenize(resume.lower()) for resume in resume_texts]

-        # Initialize BM25
-        bm25 = BM25Okapi(corpus)
-
-        # Calculate scores
-        scores = bm25.get_scores(job_tokens)
+        # Check if corpus is empty
+        if not corpus or len(corpus) == 0:
+            st.error("No resume texts provided for BM25 calculation")
+            return [0.0] * len(resume_texts)
+
+        # Check for empty documents in corpus
+        filtered_corpus = [doc for doc in corpus if len(doc) > 0]
+        if not filtered_corpus:
+            st.error("All resume texts are empty after tokenization")
+            return [0.0] * len(resume_texts)

-        return scores
+        # Initialize BM25
+        try:
+            bm25 = BM25Okapi(filtered_corpus)
+
+            # Calculate scores
+            scores = bm25.get_scores(job_tokens)
+
+            # If we filtered out empty documents, we need to reconstruct the scores array
+            if len(filtered_corpus) != len(corpus):
+                full_scores = []
+                filtered_idx = 0
+                for i in range(len(corpus)):
+                    if len(corpus[i]) > 0:
+                        full_scores.append(scores[filtered_idx])
+                        filtered_idx += 1
+                    else:
+                        full_scores.append(0.0)
+                return full_scores
+            else:
+                return scores
+        except Exception as e:
+            st.error(f"Error in BM25 calculation: {str(e)}")
+            return [0.0] * len(resume_texts)

     def calculate_hybrid_scores(self, resume_texts, resume_embeddings, job_embedding, semantic_weight=0.7, use_faiss=True):
         """Calculate hybrid scores combining semantic similarity and BM25"""
 