root committed on
Commit c456d7a · 1 Parent(s): eee21aa
Files changed (3)
  1. alt_models.py +61 -13
  2. app.py +82 -16
  3. requirements.txt +2 -2
alt_models.py CHANGED
@@ -5,6 +5,12 @@ Alternative model loading implementation without sys.modules patching
 import torch
 from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
+def count_gpus():
+    """Count the number of available GPUs"""
+    if torch.cuda.is_available():
+        return torch.cuda.device_count()
+    return 0
+
 def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
     """Load the embedding model with a try-except approach instead of module patching"""
     try:
@@ -20,12 +26,26 @@ def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
         def forward(self, *args, **kwargs):
             return self.module(*args, **kwargs)
 
+        # Get number of GPUs
+        num_gpus = count_gpus()
+        print(f"Found {num_gpus} GPUs")
+
+        # Choose device map strategy based on GPU count
+        if num_gpus > 1:
+            # For multi-GPU setup, use balanced distribution
+            device_map = "balanced"
+            print(f"Using balanced device mapping across {num_gpus} GPUs")
+        else:
+            # For single GPU, use auto or specific mapping based on memory
+            device_map = "auto"
+            print("Using automatic device mapping")
+
         # Try the standard loading approach
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         model = AutoModel.from_pretrained(
             model_name,
             trust_remote_code=True,
-            device_map="auto"
+            device_map=device_map
         )
 
         print(f"Successfully loaded {model_name}")
@@ -46,11 +66,20 @@ def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
         model_class.__module_dict__ = {}
         model_class.__module_dict__["Replicate"] = Replicate
 
+        # Get number of GPUs
+        num_gpus = count_gpus()
+
+        # Choose device map strategy based on GPU count
+        if num_gpus > 1:
+            device_map = "balanced"
+        else:
+            device_map = "auto"
+
         # Try loading with the augmented namespace
         model = model_class.from_pretrained(
             model_name,
             trust_remote_code=True,
-            device_map="auto"
+            device_map=device_map
         )
 
         print(f"Successfully loaded {model_name} with alternative approach")
@@ -65,13 +94,31 @@ def load_explanation_model(model_name="Qwen/QwQ-32B"):
     try:
         print(f"Loading explanation model {model_name}...")
 
-        # Configure 4-bit quantization for better performance
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_use_double_quant=True
-        )
+        # Get number of GPUs
+        num_gpus = count_gpus()
+        print(f"Found {num_gpus} GPUs")
+
+        # Choose quantization and device strategy based on GPU count and memory
+        if num_gpus > 1:
+            # For multi-GPU, use 4-bit quantization and balanced distribution
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True
+            )
+            device_map = "balanced"
+            print(f"Using 4-bit quantization with balanced device mapping across {num_gpus} GPUs")
+        else:
+            # For single GPU, use 4-bit quantization with automatic device mapping
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True
+            )
+            device_map = "auto"
+            print("Using 4-bit quantization with automatic device mapping")
 
         # Create a simple Replicate class that may be needed
         class Replicate(torch.nn.Module):
@@ -88,14 +135,15 @@ def load_explanation_model(model_name="Qwen/QwQ-32B"):
 
         # Check if we have enough resources to load the model
         if torch.cuda.is_available():
-            gpu_memory = torch.cuda.get_device_properties(0).total_memory
-            if gpu_memory >= 16 * (1024**3):  # 16 GB (reduced thanks to quantization)
+            total_gpu_memory = sum([torch.cuda.get_device_properties(i).total_memory for i in range(num_gpus)]) / (1024**3)
+            if num_gpus > 1 or total_gpu_memory >= 16:  # 16 GB (reduced thanks to quantization)
                 model = AutoModelForCausalLM.from_pretrained(
                     model_name,
                     quantization_config=quantization_config,
-                    device_map="auto",
+                    device_map=device_map,
                     trust_remote_code=True,
-                    torch_dtype=torch.float16
+                    torch_dtype=torch.float16,
+                    max_memory={i: f"{int(torch.cuda.get_device_properties(i).total_memory / (1024**3) * 0.9)}GiB" for i in range(num_gpus)}
                 )
                 print(f"Successfully loaded {model_name}")
                 return model, tokenizer
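For context, the device-mapping logic this commit introduces reduces to: pick `"balanced"` when more than one GPU is visible, fall back to `"auto"` otherwise, and cap each device at roughly 90% of its memory via a `max_memory` dict. Below is a minimal standalone sketch of that strategy; the helper name `build_device_strategy` is illustrative and not part of the repository.

```python
import torch

def build_device_strategy(reserve_fraction: float = 0.9):
    """Illustrative helper mirroring alt_models.py: balanced device_map across
    multiple GPUs, auto otherwise, with ~90% per-GPU memory budgets."""
    num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    device_map = "balanced" if num_gpus > 1 else "auto"
    # Budget each GPU at reserve_fraction of its total memory, in whole GiB
    max_memory = {
        i: f"{int(torch.cuda.get_device_properties(i).total_memory / (1024**3) * reserve_fraction)}GiB"
        for i in range(num_gpus)
    }
    return device_map, max_memory

# Usage sketch (assuming a CUDA machine and a quantized causal LM):
# device_map, max_memory = build_device_strategy()
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map, max_memory=max_memory, ...)
```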
app.py CHANGED
@@ -913,25 +913,91 @@ elif upload_option == "Upload from Dataset":
 # Process button
 if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
     with st.spinner("Processing job description and resumes..."):
+        # Check if we have too many resumes to process at once
+        large_dataset = len(resume_texts) > 1000
+
         # Get job description embedding
         job_embedding = screener.get_embedding(job_description)
 
-        # Get resume embeddings
-        resume_embeddings = []
-        progress_bar = st.progress(0)
-        for i, text in enumerate(resume_texts):
-            embedding = screener.get_embedding(text)
-            resume_embeddings.append(embedding)
-            progress_bar.progress((i + 1) / len(resume_texts))
-
-        # Calculate hybrid scores
-        hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
-            resume_texts,
-            resume_embeddings,
-            job_embedding,
-            semantic_weight,
-            use_faiss
-        )
+        # For large datasets, we need to process in batches
+        if large_dataset:
+            st.info(f"Processing {len(resume_texts)} resumes in batches to manage memory usage")
+
+            # Process in batches of 500 resumes
+            batch_size = 500
+            all_hybrid_scores = []
+            all_semantic_scores = []
+            all_bm25_scores = []
+
+            # Calculate BM25 scores first (doesn't require GPU)
+            bm25_scores = screener.calculate_bm25_scores(resume_texts, job_description)
+
+            # Process embeddings in batches
+            progress_bar = st.progress(0)
+            for i in range(0, len(resume_texts), batch_size):
+                batch_end = min(i + batch_size, len(resume_texts))
+                batch_texts = resume_texts[i:batch_end]
+
+                st.info(f"Processing batch {i//batch_size + 1}/{(len(resume_texts) + batch_size - 1)//batch_size} " +
+                        f"(resumes {i+1}-{batch_end})")
+
+                # Get resume embeddings for this batch
+                batch_embeddings = []
+                for j, text in enumerate(batch_texts):
+                    embedding = screener.get_embedding(text)
+                    batch_embeddings.append(embedding)
+                    progress = (i + j + 1) / len(resume_texts)
+                    progress_bar.progress(progress)
+
+                # Calculate semantic scores for this batch
+                batch_semantic_scores = []
+                for emb in batch_embeddings:
+                    # Normalize the embeddings for cosine similarity
+                    emb_norm = emb / np.linalg.norm(emb)
+                    job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
+
+                    # Calculate cosine similarity
+                    similarity = np.dot(emb_norm, job_emb_norm)
+                    batch_semantic_scores.append(similarity)
+
+                # Store scores for this batch
+                all_semantic_scores.extend(batch_semantic_scores)
+
+                # Explicitly clear GPU memory after processing each batch
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+            # Calculate hybrid scores
+            semantic_scores = all_semantic_scores
+            keyword_weight = 1.0 - semantic_weight
+
+            # Normalize BM25 scores if they're not all zeros
+            if bm25_scores and max(bm25_scores) > 0:
+                bm25_scores = [score / max(bm25_scores) for score in bm25_scores]
+
+            # Calculate final hybrid scores
+            hybrid_scores = [
+                (semantic_weight * sem_score) + (keyword_weight * bm25_score)
+                for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
+            ]
+        else:
+            # Regular processing for smaller datasets
+            # Get resume embeddings
+            resume_embeddings = []
+            progress_bar = st.progress(0)
+            for i, text in enumerate(resume_texts):
+                embedding = screener.get_embedding(text)
+                resume_embeddings.append(embedding)
+                progress_bar.progress((i + 1) / len(resume_texts))
+
+            # Calculate hybrid scores
+            hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
+                resume_texts,
+                resume_embeddings,
+                job_embedding,
+                semantic_weight,
+                use_faiss
+            )
 
         # Get top candidates
         combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
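The batched path above embeds resumes in chunks of 500, scores each against the job embedding by cosine similarity, and frees GPU memory between chunks; BM25 scores are computed once up front and max-normalized before blending. A self-contained sketch of that scoring step, assuming 1-D NumPy embeddings and a hypothetical `get_embedding` callable standing in for `screener.get_embedding`:

```python
import numpy as np

def batched_semantic_scores(resume_texts, job_embedding, get_embedding, batch_size=500):
    """Sketch of the batched scoring loop: embed resumes chunk by chunk and
    score each against the normalized job embedding via cosine similarity."""
    job_norm = job_embedding / np.linalg.norm(job_embedding)
    scores = []
    for start in range(0, len(resume_texts), batch_size):
        for text in resume_texts[start:start + batch_size]:
            emb = get_embedding(text)              # hypothetical embedder, returns a 1-D vector
            emb_norm = emb / np.linalg.norm(emb)
            scores.append(float(np.dot(emb_norm, job_norm)))
        # In the app, torch.cuda.empty_cache() runs here to release GPU memory between batches
    return scores
```

The final ranking then uses `semantic_weight * semantic + (1 - semantic_weight) * bm25`, matching what `calculate_hybrid_scores` produces on the non-batched path.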
requirements.txt CHANGED
@@ -4,7 +4,7 @@ PyPDF2==3.0.1
 python-docx==1.0.1
 spacy==3.7.2
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz
-transformers==4.36.2
+transformers==4.48.0
 torch==2.1.2
 nltk==3.8.1
 faiss-cpu==1.7.4
@@ -14,7 +14,7 @@ plotly==5.18.0
 pandas==2.1.3
 numpy==1.24.3
 tqdm==4.66.1
-huggingface-hub==0.25.0
+huggingface-hub==0.27.1
 einops
 bitsandbytes>=0.41.0
 accelerate>=0.23.0