root committed on
Commit c456d7a · 1 Parent(s): eee21aa
Files changed (3)
  1. alt_models.py +61 -13
  2. app.py +82 -16
  3. requirements.txt +2 -2
alt_models.py CHANGED
@@ -5,6 +5,12 @@ Alternative model loading implementation without sys.modules patching
 import torch
 from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
+def count_gpus():
+    """Count the number of available GPUs"""
+    if torch.cuda.is_available():
+        return torch.cuda.device_count()
+    return 0
+
 def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
     """Load the embedding model with a try-except approach instead of module patching"""
     try:
@@ -20,12 +26,26 @@ def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
         def forward(self, *args, **kwargs):
             return self.module(*args, **kwargs)
 
+        # Get number of GPUs
+        num_gpus = count_gpus()
+        print(f"Found {num_gpus} GPUs")
+
+        # Choose device map strategy based on GPU count
+        if num_gpus > 1:
+            # For multi-GPU setup, use balanced distribution
+            device_map = "balanced"
+            print(f"Using balanced device mapping across {num_gpus} GPUs")
+        else:
+            # For single GPU, use auto or specific mapping based on memory
+            device_map = "auto"
+            print("Using automatic device mapping")
+
         # Try the standard loading approach
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         model = AutoModel.from_pretrained(
             model_name,
             trust_remote_code=True,
-            device_map="auto"
+            device_map=device_map
         )
 
         print(f"Successfully loaded {model_name}")
@@ -46,11 +66,20 @@ def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
         model_class.__module_dict__ = {}
         model_class.__module_dict__["Replicate"] = Replicate
 
+        # Get number of GPUs
+        num_gpus = count_gpus()
+
+        # Choose device map strategy based on GPU count
+        if num_gpus > 1:
+            device_map = "balanced"
+        else:
+            device_map = "auto"
+
         # Try loading with the augmented namespace
         model = model_class.from_pretrained(
             model_name,
             trust_remote_code=True,
-            device_map="auto"
+            device_map=device_map
         )
 
         print(f"Successfully loaded {model_name} with alternative approach")
@@ -65,13 +94,31 @@ def load_explanation_model(model_name="Qwen/QwQ-32B"):
     try:
         print(f"Loading explanation model {model_name}...")
 
-        # Configure 4-bit quantization for better performance
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_use_double_quant=True
-        )
+        # Get number of GPUs
+        num_gpus = count_gpus()
+        print(f"Found {num_gpus} GPUs")
+
+        # Choose quantization and device strategy based on GPU count and memory
+        if num_gpus > 1:
+            # For multi-GPU, use 4-bit quantization and balanced distribution
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True
+            )
+            device_map = "balanced"
+            print(f"Using 4-bit quantization with balanced device mapping across {num_gpus} GPUs")
+        else:
+            # For single GPU, use 4-bit quantization with automatic device mapping
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True
+            )
+            device_map = "auto"
+            print("Using 4-bit quantization with automatic device mapping")
 
         # Create a simple Replicate class that may be needed
         class Replicate(torch.nn.Module):
@@ -88,14 +135,15 @@ def load_explanation_model(model_name="Qwen/QwQ-32B"):
 
         # Check if we have enough resources to load the model
         if torch.cuda.is_available():
-            gpu_memory = torch.cuda.get_device_properties(0).total_memory
-            if gpu_memory >= 16 * (1024**3):  # 16 GB (reduced thanks to quantization)
+            total_gpu_memory = sum([torch.cuda.get_device_properties(i).total_memory for i in range(num_gpus)]) / (1024**3)
+            if num_gpus > 1 or total_gpu_memory >= 16:  # 16 GB (reduced thanks to quantization)
                 model = AutoModelForCausalLM.from_pretrained(
                     model_name,
                     quantization_config=quantization_config,
-                    device_map="auto",
+                    device_map=device_map,
                     trust_remote_code=True,
-                    torch_dtype=torch.float16
+                    torch_dtype=torch.float16,
+                    max_memory={i: f"{int(torch.cuda.get_device_properties(i).total_memory / (1024**3) * 0.9)}GiB" for i in range(num_gpus)}
                 )
                 print(f"Successfully loaded {model_name}")
                 return model, tokenizer
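For context, the device-mapping logic this commit introduces reduces to: pick `"balanced"` when more than one GPU is visible, fall back to `"auto"` otherwise, and cap each device at roughly 90% of its memory via a `max_memory` dict. Below is a minimal standalone sketch of that strategy; the helper name `build_device_strategy` is illustrative and not part of the repository.

```python
import torch

def build_device_strategy(reserve_fraction: float = 0.9):
    """Illustrative helper mirroring alt_models.py: balanced device_map across
    multiple GPUs, auto otherwise, with ~90% per-GPU memory budgets."""
    num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    device_map = "balanced" if num_gpus > 1 else "auto"
    # Budget each GPU at reserve_fraction of its total memory, in whole GiB
    max_memory = {
        i: f"{int(torch.cuda.get_device_properties(i).total_memory / (1024**3) * reserve_fraction)}GiB"
        for i in range(num_gpus)
    }
    return device_map, max_memory

# Usage sketch (assuming a CUDA machine and a quantized causal LM):
# device_map, max_memory = build_device_strategy()
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map, max_memory=max_memory, ...)
```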
app.py CHANGED
@@ -913,25 +913,91 @@ elif upload_option == "Upload from Dataset":
 # Process button
 if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
     with st.spinner("Processing job description and resumes..."):
+        # Check if we have too many resumes to process at once
+        large_dataset = len(resume_texts) > 1000
+
         # Get job description embedding
         job_embedding = screener.get_embedding(job_description)
 
-        # Get resume embeddings
-        resume_embeddings = []
-        progress_bar = st.progress(0)
-        for i, text in enumerate(resume_texts):
-            embedding = screener.get_embedding(text)
-            resume_embeddings.append(embedding)
-            progress_bar.progress((i + 1) / len(resume_texts))
-
-        # Calculate hybrid scores
-        hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
-            resume_texts,
-            resume_embeddings,
-            job_embedding,
-            semantic_weight,
-            use_faiss
-        )
+        # For large datasets, we need to process in batches
+        if large_dataset:
+            st.info(f"Processing {len(resume_texts)} resumes in batches to manage memory usage")
+
+            # Process in batches of 500 resumes
+            batch_size = 500
+            all_hybrid_scores = []
+            all_semantic_scores = []
+            all_bm25_scores = []
+
+            # Calculate BM25 scores first (doesn't require GPU)
+            bm25_scores = screener.calculate_bm25_scores(resume_texts, job_description)
+
+            # Process embeddings in batches
+            progress_bar = st.progress(0)
+            for i in range(0, len(resume_texts), batch_size):
+                batch_end = min(i + batch_size, len(resume_texts))
+                batch_texts = resume_texts[i:batch_end]
+
+                st.info(f"Processing batch {i//batch_size + 1}/{(len(resume_texts) + batch_size - 1)//batch_size} " +
+                        f"(resumes {i+1}-{batch_end})")
+
+                # Get resume embeddings for this batch
+                batch_embeddings = []
+                for j, text in enumerate(batch_texts):
+                    embedding = screener.get_embedding(text)
+                    batch_embeddings.append(embedding)
+                    progress = (i + j + 1) / len(resume_texts)
+                    progress_bar.progress(progress)
+
+                # Calculate semantic scores for this batch
+                batch_semantic_scores = []
+                for emb in batch_embeddings:
+                    # Normalize the embeddings for cosine similarity
+                    emb_norm = emb / np.linalg.norm(emb)
+                    job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
+
+                    # Calculate cosine similarity
+                    similarity = np.dot(emb_norm, job_emb_norm)
+                    batch_semantic_scores.append(similarity)
+
+                # Store scores for this batch
+                all_semantic_scores.extend(batch_semantic_scores)
+
+                # Explicitly clear GPU memory after processing each batch
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+            # Calculate hybrid scores
+            semantic_scores = all_semantic_scores
+            keyword_weight = 1.0 - semantic_weight
+
+            # Normalize BM25 scores if they're not all zeros
+            if bm25_scores and max(bm25_scores) > 0:
+                bm25_scores = [score / max(bm25_scores) for score in bm25_scores]
+
+            # Calculate final hybrid scores
+            hybrid_scores = [
+                (semantic_weight * sem_score) + (keyword_weight * bm25_score)
+                for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
+            ]
+        else:
+            # Regular processing for smaller datasets
+            # Get resume embeddings
+            resume_embeddings = []
+            progress_bar = st.progress(0)
+            for i, text in enumerate(resume_texts):
+                embedding = screener.get_embedding(text)
+                resume_embeddings.append(embedding)
+                progress_bar.progress((i + 1) / len(resume_texts))
+
+            # Calculate hybrid scores
+            hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
+                resume_texts,
+                resume_embeddings,
+                job_embedding,
+                semantic_weight,
+                use_faiss
+            )
 
         # Get top candidates
         combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
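The batched path above embeds resumes in chunks of 500, scores each against the job embedding by cosine similarity, and frees GPU memory between chunks; BM25 scores are computed once up front and max-normalized before blending. A self-contained sketch of that scoring step, assuming 1-D NumPy embeddings and a hypothetical `get_embedding` callable standing in for `screener.get_embedding`:

```python
import numpy as np

def batched_semantic_scores(resume_texts, job_embedding, get_embedding, batch_size=500):
    """Sketch of the batched scoring loop: embed resumes chunk by chunk and
    score each against the normalized job embedding via cosine similarity."""
    job_norm = job_embedding / np.linalg.norm(job_embedding)
    scores = []
    for start in range(0, len(resume_texts), batch_size):
        for text in resume_texts[start:start + batch_size]:
            emb = get_embedding(text)              # hypothetical embedder, returns a 1-D vector
            emb_norm = emb / np.linalg.norm(emb)
            scores.append(float(np.dot(emb_norm, job_norm)))
        # In the app, torch.cuda.empty_cache() runs here to release GPU memory between batches
    return scores
```

The final ranking then uses `semantic_weight * semantic + (1 - semantic_weight) * bm25`, matching what `calculate_hybrid_scores` produces on the non-batched path.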
requirements.txt CHANGED
@@ -4,7 +4,7 @@ PyPDF2==3.0.1
 python-docx==1.0.1
 spacy==3.7.2
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz
-transformers==4.36.2
+transformers==4.48.0
 torch==2.1.2
 nltk==3.8.1
 faiss-cpu==1.7.4
@@ -14,7 +14,7 @@ plotly==5.18.0
 pandas==2.1.3
 numpy==1.24.3
 tqdm==4.66.1
-huggingface-hub==0.25.0
+huggingface-hub==0.27.1
 einops
 bitsandbytes>=0.41.0
 accelerate>=0.23.0