Commit c456d7a committed by root
1 Parent(s): eee21aa
ss

Files changed:
- alt_models.py  +61 -13
- app.py  +82 -16
- requirements.txt  +2 -2
alt_models.py
CHANGED
@@ -5,6 +5,12 @@ Alternative model loading implementation without sys.modules patching
 import torch
 from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
+def count_gpus():
+    """Count the number of available GPUs"""
+    if torch.cuda.is_available():
+        return torch.cuda.device_count()
+    return 0
+
 def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
     """Load the embedding model with a try-except approach instead of module patching"""
     try:
@@ -20,12 +26,26 @@ def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
             def forward(self, *args, **kwargs):
                 return self.module(*args, **kwargs)
 
+        # Get number of GPUs
+        num_gpus = count_gpus()
+        print(f"Found {num_gpus} GPUs")
+
+        # Choose device map strategy based on GPU count
+        if num_gpus > 1:
+            # For multi-GPU setup, use balanced distribution
+            device_map = "balanced"
+            print(f"Using balanced device mapping across {num_gpus} GPUs")
+        else:
+            # For single GPU, use auto or specific mapping based on memory
+            device_map = "auto"
+            print("Using automatic device mapping")
+
         # Try the standard loading approach
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         model = AutoModel.from_pretrained(
             model_name,
             trust_remote_code=True,
-            device_map=
+            device_map=device_map
         )
 
         print(f"Successfully loaded {model_name}")
@@ -46,11 +66,20 @@ def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
         model_class.__module_dict__ = {}
         model_class.__module_dict__["Replicate"] = Replicate
 
+        # Get number of GPUs
+        num_gpus = count_gpus()
+
+        # Choose device map strategy based on GPU count
+        if num_gpus > 1:
+            device_map = "balanced"
+        else:
+            device_map = "auto"
+
         # Try loading with the augmented namespace
         model = model_class.from_pretrained(
             model_name,
             trust_remote_code=True,
-            device_map=
+            device_map=device_map
         )
 
         print(f"Successfully loaded {model_name} with alternative approach")
@@ -65,13 +94,31 @@ def load_explanation_model(model_name="Qwen/QwQ-32B"):
     try:
         print(f"Loading explanation model {model_name}...")
 
-        #
+        # Get number of GPUs
+        num_gpus = count_gpus()
+        print(f"Found {num_gpus} GPUs")
+
+        # Choose quantization and device strategy based on GPU count and memory
+        if num_gpus > 1:
+            # For multi-GPU, use 4-bit quantization and balanced distribution
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True
+            )
+            device_map = "balanced"
+            print(f"Using 4-bit quantization with balanced device mapping across {num_gpus} GPUs")
+        else:
+            # For single GPU, use more aggressive 4-bit quantization
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True
+            )
+            device_map = "auto"
+            print("Using 4-bit quantization with automatic device mapping")
 
         # Create a simple Replicate class that may be needed
         class Replicate(torch.nn.Module):
@@ -88,14 +135,15 @@ def load_explanation_model(model_name="Qwen/QwQ-32B"):
 
         # Check if we have enough resources to load the model
         if torch.cuda.is_available():
-            if
+            total_gpu_memory = sum([torch.cuda.get_device_properties(i).total_memory for i in range(num_gpus)]) / (1024**3)
+            if num_gpus > 1 or total_gpu_memory >= 16:  # 16 GB (reduced thanks to quantization)
                 model = AutoModelForCausalLM.from_pretrained(
                     model_name,
                     quantization_config=quantization_config,
-                    device_map=
+                    device_map=device_map,
                     trust_remote_code=True,
-                    torch_dtype=torch.float16
+                    torch_dtype=torch.float16,
+                    max_memory={i: f"{int(torch.cuda.get_device_properties(i).total_memory / (1024**3) * 0.9)}GiB" for i in range(num_gpus)}
                 )
                 print(f"Successfully loaded {model_name}")
                 return model, tokenizer
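Note on the change above: the loaders now pick device_map = "balanced" only when more than one GPU is visible, fall back to "auto" otherwise, and cap each GPU at roughly 90% of its reported memory. A minimal standalone sketch of that policy (not part of the commit; assumes only torch is installed):

import torch

# Mirror of the device-map policy added in alt_models.py: spread layers across
# GPUs when there is more than one, otherwise let placement be decided automatically.
num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
device_map = "balanced" if num_gpus > 1 else "auto"

# Mirror of the max_memory cap passed to AutoModelForCausalLM.from_pretrained:
# about 90% of each visible GPU's total memory, expressed as GiB strings.
max_memory = {
    i: f"{int(torch.cuda.get_device_properties(i).total_memory / (1024**3) * 0.9)}GiB"
    for i in range(num_gpus)
}
print(device_map, max_memory)

On a CPU-only machine this prints auto {}.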
app.py
CHANGED
@@ -913,25 +913,91 @@ elif upload_option == "Upload from Dataset":
 # Process button
 if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
     with st.spinner("Processing job description and resumes..."):
+        # Check if we have too many resumes to process at once
+        large_dataset = len(resume_texts) > 1000
+
         # Get job description embedding
         job_embedding = screener.get_embedding(job_description)
 
-        #
+        # For large datasets, we need to process in batches
+        if large_dataset:
+            st.info(f"Processing {len(resume_texts)} resumes in batches to manage memory usage")
+
+            # Process in batches of 500 resumes
+            batch_size = 500
+            all_hybrid_scores = []
+            all_semantic_scores = []
+            all_bm25_scores = []
+
+            # Calculate BM25 scores first (doesn't require GPU)
+            bm25_scores = screener.calculate_bm25_scores(resume_texts, job_description)
+
+            # Process embeddings in batches
+            progress_bar = st.progress(0)
+            for i in range(0, len(resume_texts), batch_size):
+                batch_end = min(i + batch_size, len(resume_texts))
+                batch_texts = resume_texts[i:batch_end]
+
+                st.info(f"Processing batch {i//batch_size + 1}/{(len(resume_texts) + batch_size - 1)//batch_size} " +
+                        f"(resumes {i+1}-{batch_end})")
+
+                # Get resume embeddings for this batch
+                batch_embeddings = []
+                for j, text in enumerate(batch_texts):
+                    embedding = screener.get_embedding(text)
+                    batch_embeddings.append(embedding)
+                    progress = (i + j + 1) / len(resume_texts)
+                    progress_bar.progress(progress)
+
+                # Calculate semantic scores for this batch
+                batch_semantic_scores = []
+                for emb in batch_embeddings:
+                    # Normalize the embeddings for cosine similarity
+                    emb_norm = emb / np.linalg.norm(emb)
+                    job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
+
+                    # Calculate cosine similarity
+                    similarity = np.dot(emb_norm, job_emb_norm)
+                    batch_semantic_scores.append(similarity)
+
+                # Store scores for this batch
+                all_semantic_scores.extend(batch_semantic_scores)
+
+                # Explicitly clear GPU memory after processing each batch
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
+            # Calculate hybrid scores
+            semantic_scores = all_semantic_scores
+            keyword_weight = 1.0 - semantic_weight
+
+            # Normalize BM25 scores if they're not all zeros
+            if bm25_scores and max(bm25_scores) > 0:
+                bm25_scores = [score / max(bm25_scores) for score in bm25_scores]
+
+            # Calculate final hybrid scores
+            hybrid_scores = [
+                (semantic_weight * sem_score) + (keyword_weight * bm25_score)
+                for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
+            ]
+        else:
+            # Regular processing for smaller datasets
+            # Get resume embeddings
+            resume_embeddings = []
+            progress_bar = st.progress(0)
+            for i, text in enumerate(resume_texts):
+                embedding = screener.get_embedding(text)
+                resume_embeddings.append(embedding)
+                progress_bar.progress((i + 1) / len(resume_texts))
+
+            # Calculate hybrid scores
+            hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
+                resume_texts,
+                resume_embeddings,
+                job_embedding,
+                semantic_weight,
+                use_faiss
+            )
 
         # Get top candidates
         combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
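The batched branch above scores each resume as the cosine similarity between unit-normalized embeddings, max-normalizes the BM25 scores, and blends the two with semantic_weight; clearing the CUDA cache after every batch keeps peak GPU memory bounded while embedding thousands of resumes. A small sketch of the same scoring arithmetic outside Streamlit (function names and the default weight here are illustrative, not taken from app.py):

import numpy as np

def cosine(a, b):
    # Same as the per-batch loop: unit-normalize both vectors, then dot them.
    return float(np.dot(a / np.linalg.norm(a), b / np.linalg.norm(b)))

def blend(semantic_scores, bm25_scores, semantic_weight=0.7):
    # BM25 scores are scaled by their maximum (when non-zero), then mixed
    # with the semantic scores using the complementary keyword weight.
    keyword_weight = 1.0 - semantic_weight
    if bm25_scores and max(bm25_scores) > 0:
        bm25_scores = [s / max(bm25_scores) for s in bm25_scores]
    return [
        semantic_weight * sem + keyword_weight * kw
        for sem, kw in zip(semantic_scores, bm25_scores)
    ]

job = np.array([1.0, 0.0])
resume = np.array([0.6, 0.8])
print(blend([cosine(resume, job)], [3.0]))  # ≈ [0.72] with the illustrative weights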
requirements.txt
CHANGED
@@ -4,7 +4,7 @@ PyPDF2==3.0.1
 python-docx==1.0.1
 spacy==3.7.2
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz
-transformers==4.
+transformers==4.48.0
 torch==2.1.2
 nltk==3.8.1
 faiss-cpu==1.7.4
@@ -14,7 +14,7 @@ plotly==5.18.0
 pandas==2.1.3
 numpy==1.24.3
 tqdm==4.66.1
-huggingface-hub==0.
+huggingface-hub==0.27.1
 einops
 bitsandbytes>=0.41.0
 accelerate>=0.23.0