root committed
Commit 6ff5e82
1 Parent(s): 2e8072e
ss

Files changed:
- alt_models.py +0 -159
- app.py +497 -932
- explanation_generator.py +0 -223
- fix_dependencies.py +0 -76
- requirements.txt +8 -11
alt_models.py
DELETED
@@ -1,159 +0,0 @@
-"""
-Alternative model loading implementation without sys.modules patching
-"""
-
-import torch
-from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-
-def count_gpus():
-    """Count the number of available GPUs"""
-    if torch.cuda.is_available():
-        return torch.cuda.device_count()
-    return 0
-
-def load_embedding_model(model_name="nvidia/NV-Embed-v2"):
-    """Load the embedding model with a try-except approach instead of module patching"""
-    try:
-        print(f"Loading embedding model {model_name}...")
-
-        # Create a simple Replicate class that may be needed
-        class Replicate(torch.nn.Module):
-            def __init__(self, module, num_replicas=1):
-                super().__init__()
-                self.module = module
-                self.num_replicas = num_replicas
-
-            def forward(self, *args, **kwargs):
-                return self.module(*args, **kwargs)
-
-        # Get number of GPUs
-        num_gpus = count_gpus()
-        print(f"Found {num_gpus} GPUs")
-
-        # Choose device map strategy based on GPU count
-        if num_gpus > 1:
-            # For multi-GPU setup, use balanced distribution
-            device_map = "balanced"
-            print(f"Using balanced device mapping across {num_gpus} GPUs")
-        else:
-            # For single GPU, use auto or specific mapping based on memory
-            device_map = "auto"
-            print("Using automatic device mapping")
-
-        # Try the standard loading approach
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        model = AutoModel.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            device_map=device_map
-        )
-
-        print(f"Successfully loaded {model_name}")
-        return model, tokenizer
-    except Exception as e:
-        # If the first approach fails, try with module.__dict__
-        try:
-            print(f"First loading approach failed: {str(e)}")
-            print("Trying alternative loading approach...")
-
-            # Import the module
-            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-            # Dynamically get the module
-            model_class = AutoModel._MODEL_MAPPING[AutoModel._model_mapping[model_name]]
-
-            # Add Replicate to the module's namespace
-            model_class.__module_dict__ = {}
-            model_class.__module_dict__["Replicate"] = Replicate
-
-            # Get number of GPUs
-            num_gpus = count_gpus()
-
-            # Choose device map strategy based on GPU count
-            if num_gpus > 1:
-                device_map = "balanced"
-            else:
-                device_map = "auto"
-
-            # Try loading with the augmented namespace
-            model = model_class.from_pretrained(
-                model_name,
-                trust_remote_code=True,
-                device_map=device_map
-            )
-
-            print(f"Successfully loaded {model_name} with alternative approach")
-            return model, tokenizer
-        except Exception as e2:
-            print(f"Alternative loading approach also failed: {str(e2)}")
-            print(f"Could not load embedding model {model_name}")
-            return None, None
-
-def load_explanation_model(model_name="Qwen/QwQ-32B"):
-    """Load the explanation model with a try-except approach instead of module patching"""
-    try:
-        print(f"Loading explanation model {model_name}...")
-
-        # Get number of GPUs
-        num_gpus = count_gpus()
-        print(f"Found {num_gpus} GPUs")
-
-        # Choose quantization and device strategy based on GPU count and memory
-        if num_gpus > 1:
-            # For multi-GPU, use 4-bit quantization and balanced distribution
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True
-            )
-            device_map = "balanced"
-            print(f"Using 4-bit quantization with balanced device mapping across {num_gpus} GPUs")
-        else:
-            # For single GPU, use more aggressive 4-bit quantization
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True
-            )
-            device_map = "auto"
-            print("Using 4-bit quantization with automatic device mapping")
-
-        # Create a simple Replicate class that may be needed
-        class Replicate(torch.nn.Module):
-            def __init__(self, module, num_replicas=1):
-                super().__init__()
-                self.module = module
-                self.num_replicas = num_replicas
-
-            def forward(self, *args, **kwargs):
-                return self.module(*args, **kwargs)
-
-        # Try the standard loading approach
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-        # Check if we have enough resources to load the model
-        if torch.cuda.is_available():
-            total_gpu_memory = sum([torch.cuda.get_device_properties(i).total_memory for i in range(num_gpus)]) / (1024**3)
-            if num_gpus > 1 or total_gpu_memory >= 16:  # 16 GB (reduced thanks to quantization)
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    quantization_config=quantization_config,
-                    device_map=device_map,
-                    trust_remote_code=True,
-                    torch_dtype=torch.float16,
-                    max_memory={i: f"{int(torch.cuda.get_device_properties(i).total_memory / (1024**3) * 0.9)}GiB" for i in range(num_gpus)}
-                )
-                print(f"Successfully loaded {model_name}")
-                return model, tokenizer
-            else:
-                print("Not enough GPU memory, using template-based explanations")
-                return None, tokenizer
-        else:
-            print("CUDA not available, using template-based explanations")
-            return None, tokenizer
-    except Exception as e:
-        print(f"Error loading explanation model: {str(e)}")
-        print("Falling back to template-based explanations.")
-        return None, None
app.py
CHANGED
@@ -1,5 +1,4 @@
 import streamlit as st
-import pdfplumber
 import pandas as pd
 import numpy as np
 import torch
@@ -8,59 +7,18 @@ import faiss
 import os
 import tempfile
 import base64
 from rank_bm25 import BM25Okapi
-from transformers import AutoModel, AutoTokenizer
-from sentence_transformers import SentenceTransformer
 from nltk.tokenize import word_tokenize, sent_tokenize
 from tqdm import tqdm
-import io
 import PyPDF2
 from docx import Document
 import csv
-
-# Use the alternative model loading approach
-try:
-    # Try to import the functions from alt_models.py
-    from alt_models import load_embedding_model, load_explanation_model
-    USE_ALT_MODELS = True
-except ImportError:
-    USE_ALT_MODELS = False
-    # If import fails, we'll use the original approach
-    # Add Replicate class workaround
-    class Replicate(torch.nn.Module):
-        """Workaround class for missing Replicate in NV-Embed and Qwen models"""
-        def __init__(self, module, num_replicas=1):
-            super().__init__()
-            self.module = module
-            self.num_replicas = num_replicas
-
-        def forward(self, *args, **kwargs):
-            return self.module(*args, **kwargs)
-
-    # Create module structure if it doesn't exist yet
-    # Handle NVIDIA module
-    if "transformers.models.nvembed.modeling_nvembed" not in sys.modules:
-        # Create parent modules if they don't exist
-        if "transformers.models.nvembed" not in sys.modules:
-            sys.modules["transformers.models.nvembed"] = type('', (), {})
-        # Create the module we need
-        sys.modules["transformers.models.nvembed.modeling_nvembed"] = type('', (), {})
-
-    # Handle Qwen module
-    if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
-        # Create parent modules if they don't exist
-        if "transformers.models.qwen2" not in sys.modules:
-            sys.modules["transformers.models.qwen2"] = type('', (), {})
-        # Create the module we need
-        sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
-
-    # Add the class to modules
-    sys.modules["transformers.models.nvembed.modeling_nvembed"].Replicate = Replicate
-    sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate
-
-from explanation_generator import ExplanationGenerator
 
 # Download NLTK resources
 try:
@@ -68,49 +26,17 @@ try:
 except LookupError:
     nltk.download('punkt')
 
-# Initialize embedding model at startup
-EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
-
-if USE_ALT_MODELS:
-    # Use the alternative loading approach
-    global_embedding_model, global_embedding_tokenizer = load_embedding_model(EMBEDDING_MODEL_NAME)
-else:
-    # Use the original approach
-    print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
-    try:
-        # Load embedding model and tokenizer
-        global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
-        global_embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto")
-        print(f"Successfully loaded {EMBEDDING_MODEL_NAME}")
-    except Exception as e:
-        print(f"Error loading embedding model: {str(e)}")
-        global_embedding_tokenizer = None
-        global_embedding_model = None
-
 # Set page configuration
 st.set_page_config(
-    page_title="Resume Screener
-    page_icon="
     layout="wide",
     initial_sidebar_state="expanded"
 )
 
-# Sidebar
 with st.sidebar:
-    st.title("Configuration")
-
-    # Model selection
-    embedding_model_name = st.selectbox(
-        "Embedding Model",
-        ["nvidia/NV-Embed-v2"],
-        index=0
-    )
-
-    explanation_model_name = st.selectbox(
-        "Explanation Model",
-        ["Qwen/Qwen3-14B"],
-        index=0
-    )
 
     # Ranking weights
     st.subheader("Ranking Weights")
@@ -120,304 +46,202 @@ with st.sidebar:
 
     # Advanced options
     st.subheader("Advanced Options")
-    top_k = st.number_input("Number of results to display", min_value=1, max_value=
-    use_explanation = st.checkbox("Generate Explanations", value=True)
-    use_faiss = st.checkbox("Use FAISS for fast search", value=True)
-
-    # Memory optimization options
-    st.subheader("Memory Optimization")
-    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
-    clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
-    gc_collect_interval = st.number_input(
-        "Garbage collection interval (files)",
-        min_value=10,
-        max_value=1000,
-        value=100,
-        step=10,
-        help="Run garbage collection after processing this many files"
-    )
 
     st.markdown("---")
-    st.markdown("###
 
-# Initialize session state
 if 'results' not in st.session_state:
     st.session_state.results = []
-if 'embedding_model' not in st.session_state:
-    st.session_state.embedding_model = global_embedding_model
-if 'tokenizer' not in st.session_state:
-    st.session_state.tokenizer = global_embedding_tokenizer
-if 'faiss_index' not in st.session_state:
-    st.session_state.faiss_index = None
-if 'explanation_generator' not in st.session_state:
-    st.session_state.explanation_generator = None
 
-    def extract_text_from_file(self,
         """Extract text from various file types"""
         try:
             if file_type == "pdf":
-                text = ""
-                for page in pdf.pages:
-                    text += page.extract_text() or ""
-
-                # If pdfplumber fails, try PyPDF2 as fallback
-                if not text.strip():
-                    reader = PyPDF2.PdfReader(file)
                     text = ""
-                    for
-                        page = reader.pages[page_num]
                         text += page.extract_text() or ""
-
             elif file_type == "docx":
-                doc = Document(
                 return " ".join([paragraph.text for paragraph in doc.paragraphs])
 
             elif file_type == "txt":
             elif file_type == "csv":
-                return csv_text
-
-            else:
-                st.error(f"Unsupported file type: {file_type}")
-                return ""
-
         except Exception as e:
-            st.error(f"Error extracting text from
             return ""
 
     def get_embedding(self, text):
-        """Generate
-        if self.
-            return np.zeros(768)  # Default embedding size as fallback
 
         try:
-            # Move
-            device = next(self.
             inputs = {k: v.to(device) for k, v in inputs.items()}
 
             with torch.no_grad():
-                outputs = self.
-
-            # Handle specific case for NV-Embed-v2 which returns a nested structure
-            if self.embedding_model_name == "nvidia/NV-Embed-v2":
-                # Access the embedding from the NV-Embed specific output format
-                if hasattr(outputs, "pooler_output"):
-                    embeddings = outputs.pooler_output
-                    embedding_np = embeddings.cpu().detach().numpy()
-                    if self.embedding_size is None:
-                        self.embedding_size = embedding_np.shape[1]
-                    return embedding_np[0]  # Return the first embedding
-                # Try to handle multi-level dictionary if the model changed output format
-                elif isinstance(outputs, dict) and "embedding" in outputs:
-                    embeddings = outputs["embedding"]
-                    embedding_np = embeddings.cpu().detach().numpy()
-                    if self.embedding_size is None:
-                        self.embedding_size = embedding_np.shape[1]
-                    return embedding_np[0]
-
-            # Handle different output structures
-            if hasattr(outputs, "last_hidden_state"):
-                # Mean pooling across token dimension
-                embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
-                embedding_np = embeddings.cpu().detach().numpy()
-
-                # Set embedding size if not set
-                if self.embedding_size is None:
-                    self.embedding_size = embedding_np.shape[0]
-
-                return embedding_np
-            elif isinstance(outputs, dict) and "embeddings" in outputs:
-                # For models that return a dictionary with embeddings
-                embeddings = outputs["embeddings"]
-                embedding_np = embeddings.cpu().detach().numpy()
 
-            elif isinstance(outputs, torch.Tensor):
-                # For models that return a tensor directly
-                embedding_np = outputs.cpu().detach().numpy()
-
-                # Set embedding size if not set
-                if self.embedding_size is None:
-                    self.embedding_size = embedding_np.shape[-1]
-
-                return embedding_np.squeeze()
             else:
-                st.warning(f"Unexpected output structure from model: {type(outputs)}")
-                if hasattr(outputs, "__dict__"):
-                    for attr_name in dir(outputs):
-                        if not attr_name.startswith('_'):
-                            attr = getattr(outputs, attr_name)
-                            if isinstance(attr, torch.Tensor):
-                                st.info(f"Found tensor attribute '{attr_name}' with shape {attr.shape}")
-                                embedding_np = attr.cpu().detach().numpy()
-                                if self.embedding_size is None:
-                                    self.embedding_size = embedding_np.shape[-1]
-                                return embedding_np.squeeze()
 
-            self.embedding_size = 768  # Default size
-            return np.zeros(self.embedding_size)
         except Exception as e:
             st.error(f"Error generating embedding: {str(e)}")
-            self.embedding_size = 768  # Default size
-            return np.zeros(self.embedding_size)
-
-    def create_faiss_index(self, embeddings):
-        """Create a FAISS index for fast similarity search"""
-        # Get the dimension of the embeddings
-        dimension = embeddings[0].shape[0]
-
-        # Create a FAISS index
-        index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity with normalized vectors
-
-        # Add normalized vectors to the index
-        embeddings_normalized = np.vstack([emb / np.linalg.norm(emb) for emb in embeddings])
-        index.add(embeddings_normalized)
-
-        return index
-
-    def query_faiss_index(self, index, query_embedding, k=10):
-        """Query the FAISS index with a query embedding"""
-        # Normalize query embedding
-        query_embedding = query_embedding / np.linalg.norm(query_embedding)
-
-        # Reshape to a row vector if needed
-        if len(query_embedding.shape) == 1:
-            query_embedding = query_embedding.reshape(1, -1)
-
-        # Query the index
-        scores, indices = index.search(query_embedding, k)
-
-        return scores[0], indices[0]  # Return the scores and indices as flat arrays
 
     def calculate_bm25_scores(self, resume_texts, job_description):
         """Calculate BM25 scores for keyword matching"""
-        # Tokenize job description
-        job_tokens = word_tokenize(job_description.lower())
-
-        # Prepare corpus from resumes
-        corpus = [word_tokenize(resume.lower()) for resume in resume_texts]
-
-        # Check if corpus is empty
-        if not corpus or len(corpus) == 0:
-            st.error("No resume texts provided for BM25 calculation")
-            return [0.0] * len(resume_texts)
-
-        # Check for empty documents in corpus
-        filtered_corpus = [doc for doc in corpus if len(doc) > 0]
-        if not filtered_corpus:
-            st.error("All resume texts are empty after tokenization")
-            return [0.0] * len(resume_texts)
-
-        # Initialize BM25
        try:
-            if len(corpus[i]) > 0:
-                full_scores.append(scores[filtered_idx])
-                filtered_idx += 1
-            else:
-                full_scores.append(0.0)
-            return full_scores
-        else:
-            return scores
        except Exception as e:
-            st.error(f"Error
            return [0.0] * len(resume_texts)
 
-    def calculate_hybrid_scores(self, resume_texts,
-        """Calculate hybrid scores combining semantic
-            # Direct cosine similarity calculation for smaller datasets
-            semantic_scores = []
-            for emb in resume_embeddings:
-                # Normalize the embeddings for cosine similarity
-                emb_norm = emb / np.linalg.norm(emb)
-                job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
-
-                # Calculate cosine similarity
-                similarity = np.dot(emb_norm, job_emb_norm)
-                semantic_scores.append(similarity)
 
         # Calculate BM25 scores
         bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
 
         # Normalize BM25 scores
-        if max(bm25_scores) > 0:
 
         # Calculate hybrid scores
-        keyword_weight = 1.0 - semantic_weight
         hybrid_scores = [
             (semantic_weight * sem_score) + (keyword_weight * bm25_score)
             for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
@@ -426,682 +250,423 @@ class ResumeScreener:
         return hybrid_scores, semantic_scores, bm25_scores
 
     def extract_skills(self, text, job_description):
-        """Extract skills from
-            "skills in", "expertise in", "background in", "capabilities in",
-            "years of experience in", "understanding of", "trained in"]
-
-        # Extract skills from sentences containing skill indicators
-        sentences = sent_tokenize(job_description)
-        for sentence in sentences:
-            sentence_lower = sentence.lower()
-            for indicator in skill_indicators:
-                if indicator in sentence_lower:
-                    # Extract words after the indicator, possibly until end of sentence or punctuation
-                    skills_part = sentence_lower.split(indicator, 1)[1]
-
-                    # Extract words, cleaning up symbols
-                    words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skills_part)
-                    for word in words:
-                        if len(word) >= 3:  # Only consider words 3 letters or longer
-                            potential_skills.add(word.lower())
 
-        for skill_list in skill_lists:
-            words = re.findall(r'\b[a-zA-Z0-9+#/.]+\b', skill_list)
-            for word in words:
-                if len(word) >= 3:
-                    potential_skills.add(word.lower())
 
-            "git", "ci/cd", "agile", "scrum", "rest", "graphql", "ml", "ai", "data science"]
 
-            pattern = r'\b' + re.escape(skill) + r'\b'
-            matches = re.findall(pattern, text.lower())
-            if matches:
-                matched_skills.append(skill)
 
-        return list(set(
-
-    def extract_key_phrases(self, text, job_description):
-        """Extract key phrases from text that match job description keywords"""
-        # Identify job skills first
-        skills = self.extract_skills(job_description, job_description)
-
-        # Extract sentences that contain skills
-        sentences = sent_tokenize(text)
-        skill_sentences = []
-
-        for sentence in sentences:
-            sentence_lower = sentence.lower()
-            for skill in skills:
-                if skill in sentence_lower:
-                    # Append the sentence with the skill highlighted
-                    highlighted = sentence.replace(skill, f"**{skill}**")
-                    skill_sentences.append(highlighted)
-                    break
-
-        # Get additional generic matches if we don't have enough skill sentences
-        if len(skill_sentences) < 5:
-            # Simple extraction based on job description keywords
-            job_tokens = set(word.lower() for word in word_tokenize(job_description) if len(word) > 3)
-            text_tokens = word_tokenize(text)
-
-            matches = []
-            for i, token in enumerate(text_tokens):
-                if token.lower() in job_tokens:
-                    # Get a phrase context (5 words before and after)
-                    start = max(0, i - 5)
-                    end = min(len(text_tokens), i + 6)
-                    phrase = " ".join(text_tokens[start:end])
-                    matches.append(phrase)
-
-            # Add unique phrases to complement skill sentences
-            unique_matches = list(set(matches))
-            skill_sentences.extend(unique_matches[:5 - len(skill_sentences)])
-
-        # Return unique phrases, up to 5
-        return skill_sentences[:5]
 
     def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
-        """Generate explanation
         )
-        else:
-            # Fallback to simple explanation
-            matching_phrases = self.extract_key_phrases(resume_text, job_description)
 
-            explanation += f"Key matching elements include: {matching_phrases[0]}"
 
     csv = df.to_csv(index=False)
     b64 = base64.b64encode(csv.encode()).decode()
-    return href
 
-    datasets = []
-
-    # Common dataset paths in Hugging Face Spaces
-    potential_paths = [
-        "/data",  # Common mount point
-        "data",  # Relative path
-        os.path.expanduser("~/data"),  # Home directory
-    ]
-
-    for path in potential_paths:
-        if os.path.exists(path) and os.path.isdir(path):
-            # Look for CSV files
-            csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
-            for csv_file in csv_files:
-                datasets.append(os.path.join(path, csv_file))
-
-            # Look for directories that might contain PDFs
-            for subdir in os.listdir(path):
-                subdir_path = os.path.join(path, subdir)
-                if os.path.isdir(subdir_path):
-                    pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')])
-                    if pdf_count > 0:
-                        datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))
-
-    return datasets
-
-# Main app UI
-st.title("Resume Screener & Skill Extractor")
 st.markdown("---")
 
-# Initialize
 
-# Job
-st.header("1
 job_description = st.text_area(
-    "
-    height=
 )
 
-# Resume
-st.header("2
 )
 
-uploaded_files = []
 resume_texts = []
 file_names = []
 
 uploaded_files = st.file_uploader(
     "Upload resume files",
-    type=["pdf", "docx", "txt"
     accept_multiple_files=True,
-    help="
 )
 
 if uploaded_files:
-    with st.spinner("Processing
     for file in uploaded_files:
         file_type = file.name.split('.')[-1].lower()
 
         with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
             tmp_file.write(file.getvalue())
             tmp_path = tmp_file.name
 
         text = screener.extract_text_from_file(tmp_path, file_type)
-        if text:
             resume_texts.append(text)
             file_names.append(file.name)
 
         os.unlink(tmp_path)
-
-    # Input for directory path
-    resume_dir = st.text_input(
-        "Enter the path to the directory containing resume files:",
-        help="For Hugging Face Spaces, this could be a mounted directory or dataset."
-    )
-
-    # Limit batch size
-    batch_size = st.number_input(
-        "Number of files to process per batch (lower for less memory usage):",
-        min_value=10,
-        max_value=1000,
-        value=100,
-        step=10
-    )
-
-    # File types to process
-    file_types = st.multiselect(
-        "Select file types to process:",
-        ["pdf", "docx", "txt", "csv"],
-        default=["pdf"]
-    )
 
-            all_files.extend([
-                os.path.join(resume_dir, f)
-                for f in os.listdir(resume_dir)
-                if f.lower().endswith(f'.{file_type}')
-            ])
 
-            # Process in batches
-            processed_count = 0
-            progress_bar = st.progress(0)
-            status_text = st.empty()
-
-            for i in range(0, total_files, batch_size):
-                batch_files = all_files[i:i+batch_size]
-
-                for j, file_path in enumerate(batch_files):
-                    try:
-                        file_type = file_path.split('.')[-1].lower()
-                        text = screener.extract_text_from_file(file_path, file_type)
-                        if text:
-                            resume_texts.append(text)
-                            file_names.append(os.path.basename(file_path))
-                            processed_count += 1
-
-                        # Apply memory optimization if enabled
-                        if memory_optimization and j % gc_collect_interval == 0 and j > 0:
-                            import gc
-                            gc.collect()
-                            status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
-                    except Exception as e:
-                        st.warning(f"Error processing {file_path}: {str(e)}")
-
-                # Update progress
-                progress = min(1.0, (i + len(batch_files)) / total_files)
-                progress_bar.progress(progress)
-                status_text.text(f"Processed {processed_count}/{total_files} files...")
-
-                # Run garbage collection between batches if memory optimization is enabled
-                if memory_optimization:
-                    import gc
-                    gc.collect()
-
-            # Final garbage collection if memory optimization is enabled
-            if memory_optimization:
-                import gc
-                gc.collect()
-
-            st.session_state.resumes_uploaded = True
-            st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
-        else:
-            st.error(f"No matching files found in {resume_dir}")
-    else:
-        st.error(f"Directory {resume_dir} does not exist or is not accessible.")
-elif upload_option == "Upload from Dataset":
-    # Upload from Dataset implementation
-    st.write("Upload a CSV file containing resume data or load from available datasets.")
-
-    # Check for available datasets in Hugging Face Spaces
-    hf_datasets = get_huggingface_spaces_datasets()
-
-    if hf_datasets:
-        st.subheader("Available Datasets in Hugging Face Spaces")
-        dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
-        selected_dataset = st.selectbox("Select a dataset:", dataset_options)
-
-        if selected_dataset != "None":
-            selected_index = dataset_options.index(selected_dataset) - 1  # Adjust for "None"
-            dataset_path = hf_datasets[selected_index]
 
-            )
-
-            if st.button("Process PDF Directory"):
-                # Use the same processing logic as in the "Process Directory" option
-                if os.path.isdir(pdf_dir):
-                    all_files = [
-                        os.path.join(pdf_dir, f)
-                        for f in os.listdir(pdf_dir)
-                        if f.lower().endswith('.pdf')
-                    ]
-
-                    if all_files:
-                        total_files = len(all_files)
-                        st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")
-
-                        # Process in batches
-                        processed_count = 0
-                        progress_bar = st.progress(0)
-                        status_text = st.empty()
-
-                        for i in range(0, total_files, batch_size):
-                            batch_files = all_files[i:i+batch_size]
-
-                            for j, file_path in enumerate(batch_files):
-                                try:
-                                    text = screener.extract_text_from_file(file_path, "pdf")
-                                    if text:
-                                        resume_texts.append(text)
-                                        file_names.append(os.path.basename(file_path))
-                                        processed_count += 1
-
-                                    # Apply memory optimization if enabled
-                                    if memory_optimization and j % gc_collect_interval == 0 and j > 0:
-                                        import gc
-                                        gc.collect()
-                                except Exception as e:
-                                    st.warning(f"Error processing {file_path}: {str(e)}")
-
-                            # Update progress
-                            progress = min(1.0, (i + len(batch_files)) / total_files)
-                            progress_bar.progress(progress)
-                            status_text.text(f"Processed {processed_count}/{total_files} files...")
-
-                            # Memory optimization
-                            if memory_optimization:
-                                import gc
-                                gc.collect()
 
-            st.write(f"Selected CSV dataset: {dataset_path}")
 
-            df = pd.read_csv(dataset_path)
 
-                    # Use index as filename if no filename column
-                    file_name = f"resume_{i}.txt"
-                    if 'filename' in df.columns:
-                        file_name = row['filename']
-                    file_names.append(file_name)
-
-                st.session_state.resumes_uploaded = True
-                st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
-            except Exception as e:
-                st.error(f"Error processing CSV: {str(e)}")
-
-    # Rest of the existing Upload from Dataset code
-    dataset_option = st.radio(
-        "Dataset source:",
-        ["Upload CSV", "Use Hugging Face Dataset"]
-    )
-
-    if dataset_option == "Upload CSV":
-        csv_file = st.file_uploader(
-            "Upload CSV file containing resume data",
-            type=["csv"],
-            help="CSV should contain at least a column with resume text."
         )
 
         )
-
-                    resume_texts.append(text)
-                    # Use index as filename if no filename column
-                    file_name = f"resume_{i}.txt"
-                    if 'filename' in df.columns:
-                        file_name = row['filename']
-                    file_names.append(file_name)
-
-                st.session_state.resumes_uploaded = True
-                st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
-    else:
-        # Hugging Face Dataset option
-        dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
-        split = st.text_input("Enter dataset split (e.g., 'train'):", "train")
-
-        if dataset_name and st.button("Load Dataset"):
-            with st.spinner(f"Loading dataset {dataset_name}..."):
-                try:
-                    from datasets import load_dataset
 
-                        "Select column containing resume text:",
-                        dataset.column_names
-                    )
 
-                        for i, item in enumerate(dataset):
-                            if text_column in item:
-                                text = str(item[text_column])
-                                if text:
-                                    resume_texts.append(text)
-                                    # Use index or id field as filename
-                                    file_name = f"resume_{i}.txt"
-                                    if 'id' in item:
-                                        file_name = f"resume_{item['id']}.txt"
-                                    file_names.append(file_name)
 
-if st.button("Find
 
-        # Process in batches of 500 resumes
-        batch_size = 500
-        all_hybrid_scores = []
-        all_semantic_scores = []
-        all_bm25_scores = []
-
-        # Calculate BM25 scores first (doesn't require GPU)
-        bm25_scores = screener.calculate_bm25_scores(resume_texts, job_description)
 
-                progress = (i + j + 1) / len(resume_texts)
-                progress_bar.progress(progress)
-
-            # Calculate semantic scores for this batch
-            batch_semantic_scores = []
-            for emb in batch_embeddings:
-                # Normalize the embeddings for cosine similarity
-                emb_norm = emb / np.linalg.norm(emb)
-                job_emb_norm = job_embedding / np.linalg.norm(job_embedding)
-
-                # Calculate cosine similarity
-                similarity = np.dot(emb_norm, job_emb_norm)
-                batch_semantic_scores.append(similarity)
-
-            # Store scores for this batch
-            all_semantic_scores.extend(batch_semantic_scores)
 
-            bm25_scores = [score / max(bm25_scores) for score in bm25_scores]
 
-            for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
-        ]
-    else:
-        # Regular processing for smaller datasets
-        # Get resume embeddings
-        resume_embeddings = []
-        progress_bar = st.progress(0)
-        for i, text in enumerate(resume_texts):
-            embedding = screener.get_embedding(text)
-            resume_embeddings.append(embedding)
-            progress_bar.progress((i + 1) / len(resume_texts))
-
-        # Calculate hybrid scores
-        hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
-            resume_texts,
-            resume_embeddings,
-            job_embedding,
-            semantic_weight,
-            use_faiss
-        )
-
-    # Get top candidates
-    combined_data = list(zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores))
-    sorted_data = sorted(combined_data, key=lambda x: x[2], reverse=True)
-    top_candidates = sorted_data[:int(top_k)]
-
-    # Create results with explanations if enabled
-    results = []
-    for name, text, score, semantic_score, bm25_score in top_candidates:
-        # Extract skills for this resume
-        skills = screener.extract_skills(text, job_description)
 
-            "score": score,
-            "semantic_score": semantic_score,
-            "keyword_score": bm25_score,
-            "text_preview": text[:500] + "...",
-            "matched_phrases": screener.extract_key_phrases(text, job_description),
-            "skills": skills
-        }
 
-            explanation = screener.generate_explanation(
-                text,
-                job_description,
-                score,
-                semantic_score,
-                bm25_score,
-                skills
-            )
-            result["explanation"] = explanation
-        else:
-            result["explanation"] = ""
-
-        results.append(result)
-
-    st.session_state.results = results
-    st.success(f"Found top {len(results)} candidates!")
 
-# Display
 if st.session_state.results:
-    st.header("
 
-    # Create
     for result in st.session_state.results:
         })
 
-    st.markdown(
 
     with col1:
-        st.write(f"Keyword Score: {result['keyword_score']:.4f}")
 
-            st.write(f"• {skill}")
-        else:
-            st.write("No specific skills matched.")
 
     with col2:
 
 # Footer
 st.markdown("---")
-st.markdown(
1 |
import streamlit as st
|
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import torch
|
|
|
7 |
import os
|
8 |
import tempfile
|
9 |
import base64
|
10 |
+
import re
|
11 |
+
import io
|
12 |
from rank_bm25 import BM25Okapi
|
13 |
+
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
|
|
14 |
from nltk.tokenize import word_tokenize, sent_tokenize
|
15 |
from tqdm import tqdm
|
16 |
+
import pdfplumber
|
|
|
17 |
import PyPDF2
|
18 |
from docx import Document
|
19 |
import csv
|
20 |
+
from datasets import load_dataset
|
21 |
+
import gc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# Download NLTK resources
|
24 |
try:
|
|
|
26 |
except LookupError:
|
27 |
nltk.download('punkt')
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
# Set page configuration
|
30 |
st.set_page_config(
|
31 |
+
page_title="AI Resume Screener",
|
32 |
+
page_icon="🎯",
|
33 |
layout="wide",
|
34 |
initial_sidebar_state="expanded"
|
35 |
)
|
36 |
|
37 |
+
# Sidebar configuration
|
38 |
with st.sidebar:
|
39 |
+
st.title("⚙️ Configuration")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
# Ranking weights
|
42 |
st.subheader("Ranking Weights")
|
|
|
46 |
|
47 |
# Advanced options
|
48 |
st.subheader("Advanced Options")
|
49 |
+
top_k = st.number_input("Number of results to display", min_value=1, max_value=50, value=10, step=1)
|
50 |
+
use_explanation = st.checkbox("Generate AI Explanations", value=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
st.markdown("---")
|
53 |
+
st.markdown("### 🤖 Models Used")
|
54 |
+
st.markdown("- **Embedding**: NVIDIA NV-Embed-v2")
|
55 |
+
st.markdown("- **Explanation**: Qwen3-14B (4-bit)")
|
56 |
+
st.markdown("### 📊 About")
|
57 |
+
st.markdown("This app uses hybrid ranking combining semantic similarity with keyword matching to find the best candidates for job positions.")
|
58 |
|
59 |
+
# Initialize session state
|
60 |
+
if 'embedding_model' not in st.session_state:
|
61 |
+
st.session_state.embedding_model = None
|
62 |
+
if 'explanation_model' not in st.session_state:
|
63 |
+
st.session_state.explanation_model = None
|
64 |
if 'results' not in st.session_state:
|
65 |
st.session_state.results = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
+
@st.cache_resource
|
68 |
+
def load_embedding_model():
|
69 |
+
"""Load and cache the embedding model"""
|
70 |
+
try:
|
71 |
+
with st.spinner("🔄 Loading NVIDIA NV-Embed-v2 model..."):
|
72 |
+
tokenizer = AutoTokenizer.from_pretrained("nvidia/NV-Embed-v2", trust_remote_code=True)
|
73 |
+
model = AutoModel.from_pretrained(
|
74 |
+
"nvidia/NV-Embed-v2",
|
75 |
+
trust_remote_code=True,
|
76 |
+
device_map="auto",
|
77 |
+
torch_dtype=torch.float16
|
78 |
+
)
|
79 |
+
st.success("✅ Embedding model loaded successfully!")
|
80 |
+
return model, tokenizer
|
81 |
+
except Exception as e:
|
82 |
+
st.error(f"❌ Error loading embedding model: {str(e)}")
|
83 |
+
return None, None
|
84 |
+
|
85 |
+
@st.cache_resource
|
86 |
+
def load_explanation_model():
|
87 |
+
"""Load and cache the explanation model with quantization"""
|
88 |
+
if not use_explanation:
|
89 |
+
return None, None
|
90 |
|
91 |
+
try:
|
92 |
+
with st.spinner("🔄 Loading Qwen3-14B model with 4-bit quantization..."):
|
93 |
+
# Configure 4-bit quantization
|
94 |
+
quantization_config = BitsAndBytesConfig(
|
95 |
+
load_in_4bit=True,
|
96 |
+
bnb_4bit_quant_type="nf4",
|
97 |
+
bnb_4bit_compute_dtype=torch.float16,
|
98 |
+
bnb_4bit_use_double_quant=True
|
99 |
+
)
|
100 |
+
|
101 |
+
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct", trust_remote_code=True)
|
102 |
+
model = AutoModelForCausalLM.from_pretrained(
|
103 |
+
"Qwen/Qwen2.5-14B-Instruct",
|
104 |
+
quantization_config=quantization_config,
|
105 |
+
device_map="auto",
|
106 |
+
trust_remote_code=True,
|
107 |
+
torch_dtype=torch.float16
|
108 |
+
)
|
109 |
+
st.success("✅ Explanation model loaded successfully!")
|
110 |
+
return model, tokenizer
|
111 |
+
except Exception as e:
|
112 |
+
st.error(f"❌ Error loading explanation model: {str(e)}")
|
113 |
+
return None, None
|
114 |
+
|
115 |
+
class ResumeScreener:
|
116 |
+
def __init__(self):
|
117 |
+
# Load models
|
118 |
+
self.embedding_model, self.embedding_tokenizer = load_embedding_model()
|
119 |
+
if use_explanation:
|
120 |
+
self.explanation_model, self.explanation_tokenizer = load_explanation_model()
|
121 |
+
else:
|
122 |
+
self.explanation_model, self.explanation_tokenizer = None, None
|
123 |
|
124 |
+
def extract_text_from_file(self, file_path, file_type):
|
125 |
"""Extract text from various file types"""
|
126 |
try:
|
127 |
if file_type == "pdf":
|
128 |
+
with open(file_path, 'rb') as file:
|
129 |
+
with pdfplumber.open(file) as pdf:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
text = ""
|
131 |
+
for page in pdf.pages:
|
|
|
132 |
text += page.extract_text() or ""
|
133 |
+
|
134 |
+
if not text.strip():
|
135 |
+
# Fallback to PyPDF2
|
136 |
+
file.seek(0)
|
137 |
+
reader = PyPDF2.PdfReader(file)
|
138 |
+
text = ""
|
139 |
+
for page in reader.pages:
|
140 |
+
text += page.extract_text() or ""
|
141 |
+
return text
|
142 |
+
|
143 |
elif file_type == "docx":
|
144 |
+
doc = Document(file_path)
|
145 |
return " ".join([paragraph.text for paragraph in doc.paragraphs])
|
146 |
|
147 |
elif file_type == "txt":
|
148 |
+
with open(file_path, 'r', encoding='utf-8') as file:
|
149 |
+
return file.read()
|
150 |
+
|
151 |
elif file_type == "csv":
|
152 |
+
with open(file_path, 'r', encoding='utf-8') as file:
|
153 |
+
csv_reader = csv.reader(file)
|
154 |
+
return " ".join([" ".join(row) for row in csv_reader])
|
155 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
except Exception as e:
|
157 |
+
st.error(f"Error extracting text from {file_path}: {str(e)}")
|
158 |
return ""
|
159 |
|
160 |
def get_embedding(self, text):
|
161 |
+
"""Generate embedding for text"""
|
162 |
+
if self.embedding_model is None:
|
163 |
+
return np.zeros(4096) # NV-Embed-v2 dimension
|
|
|
164 |
|
165 |
try:
|
166 |
+
# Truncate text to avoid memory issues
|
167 |
+
text = text[:8192] # Reasonable limit for NV-Embed-v2
|
168 |
+
|
169 |
+
inputs = self.embedding_tokenizer(
|
170 |
+
text,
|
171 |
+
return_tensors="pt",
|
172 |
+
truncation=True,
|
173 |
+
max_length=512,
|
174 |
+
padding=True
|
175 |
+
)
|
176 |
|
177 |
+
# Move to same device as model
|
178 |
+
device = next(self.embedding_model.parameters()).device
|
179 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
180 |
|
181 |
with torch.no_grad():
|
182 |
+
outputs = self.embedding_model(**inputs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
|
184 |
+
# Extract embeddings - NV-Embed-v2 specific
|
185 |
+
if hasattr(outputs, 'pooler_output'):
|
186 |
+
embeddings = outputs.pooler_output
|
187 |
+
elif hasattr(outputs, 'last_hidden_state'):
|
188 |
+
embeddings = outputs.last_hidden_state.mean(dim=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
else:
|
190 |
+
embeddings = outputs[0].mean(dim=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
+
return embeddings.cpu().numpy().squeeze()
|
193 |
+
|
|
|
|
|
194 |
except Exception as e:
|
195 |
st.error(f"Error generating embedding: {str(e)}")
|
196 |
+
return np.zeros(4096)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
def calculate_bm25_scores(self, resume_texts, job_description):
|
199 |
"""Calculate BM25 scores for keyword matching"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
try:
|
201 |
+
job_tokens = word_tokenize(job_description.lower())
|
202 |
+
corpus = [word_tokenize(text.lower()) for text in resume_texts if text.strip()]
|
203 |
+
|
204 |
+
if not corpus:
|
205 |
+
return [0.0] * len(resume_texts)
|
206 |
+
|
207 |
+
bm25 = BM25Okapi(corpus)
|
208 |
+
scores = bm25.get_scores(job_tokens)
|
209 |
+
return scores.tolist()
|
210 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
except Exception as e:
|
212 |
+
st.error(f"Error calculating BM25 scores: {str(e)}")
|
213 |
return [0.0] * len(resume_texts)
|
214 |
|
215 |
+
def calculate_hybrid_scores(self, resume_texts, job_description):
|
216 |
+
"""Calculate hybrid scores combining semantic and keyword matching"""
|
217 |
+
# Get job embedding
|
218 |
+
job_embedding = self.get_embedding(job_description)
|
219 |
+
|
220 |
+
# Get resume embeddings
|
221 |
+
resume_embeddings = []
|
222 |
+
progress_bar = st.progress(0)
|
223 |
+
for i, text in enumerate(resume_texts):
|
224 |
+
embedding = self.get_embedding(text)
|
225 |
+
resume_embeddings.append(embedding)
|
226 |
+
progress_bar.progress((i + 1) / len(resume_texts))
|
227 |
+
|
228 |
+
# Calculate semantic scores (cosine similarity)
|
229 |
+
semantic_scores = []
|
230 |
+
for resume_emb in resume_embeddings:
|
231 |
+
job_norm = job_embedding / (np.linalg.norm(job_embedding) + 1e-8)
|
232 |
+
resume_norm = resume_emb / (np.linalg.norm(resume_emb) + 1e-8)
|
233 |
+
similarity = np.dot(job_norm, resume_norm)
|
234 |
+
semantic_scores.append(float(similarity))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
|
236 |
# Calculate BM25 scores
|
237 |
bm25_scores = self.calculate_bm25_scores(resume_texts, job_description)
|
238 |
|
239 |
# Normalize BM25 scores
|
240 |
+
if bm25_scores and max(bm25_scores) > 0:
|
241 |
+
max_bm25 = max(bm25_scores)
|
242 |
+
bm25_scores = [score / max_bm25 for score in bm25_scores]
|
243 |
|
244 |
# Calculate hybrid scores
|
|
|
245 |
hybrid_scores = [
|
246 |
(semantic_weight * sem_score) + (keyword_weight * bm25_score)
|
247 |
for sem_score, bm25_score in zip(semantic_scores, bm25_scores)
|
|
|
250 |
return hybrid_scores, semantic_scores, bm25_scores
|
251 |
|
252 |
def extract_skills(self, text, job_description):
|
253 |
+
"""Extract skills from resume based on job description"""
|
254 |
+
# Common tech skills and job-related terms
|
255 |
+
common_skills = [
|
256 |
+
"python", "java", "javascript", "react", "node.js", "sql", "html", "css",
|
257 |
+
"aws", "azure", "docker", "kubernetes", "git", "agile", "scrum", "ci/cd",
|
258 |
+
"machine learning", "data science", "artificial intelligence", "tensorflow",
|
259 |
+
"pytorch", "pandas", "numpy", "scikit-learn", "mysql", "postgresql",
|
260 |
+
"mongodb", "redis", "elasticsearch", "spark", "hadoop", "tableau", "powerbi"
|
261 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
|
263 |
+
# Extract skills from job description
|
264 |
+
job_words = set(word.lower() for word in word_tokenize(job_description) if len(word) > 2)
|
|
|
|
|
|
|
|
|
|
|
265 |
|
266 |
+
# Find matching skills
|
267 |
+
found_skills = []
|
268 |
+
text_lower = text.lower()
|
|
|
269 |
|
270 |
+
# Check common skills
|
271 |
+
for skill in common_skills:
|
272 |
+
if skill in text_lower and skill in " ".join(job_words):
|
273 |
+
found_skills.append(skill)
|
274 |
|
275 |
+
# Check job-specific terms
|
276 |
+
for word in job_words:
|
277 |
+
if len(word) > 3 and word in text_lower:
|
278 |
+
found_skills.append(word)
|
|
|
|
|
|
|
|
|
279 |
|
280 |
+
return list(set(found_skills))[:10] # Return top 10 unique skills
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
|
282 |
def generate_explanation(self, resume_text, job_description, score, semantic_score, bm25_score, skills):
|
283 |
+
"""Generate explanation using Qwen model"""
|
284 |
+
if self.explanation_model is None or self.explanation_tokenizer is None:
|
285 |
+
return self._generate_simple_explanation(score, semantic_score, bm25_score, skills)
|
286 |
+
|
287 |
+
try:
|
288 |
+
# Create prompt
|
289 |
+
prompt = f"""As a recruitment AI assistant, explain why this resume scored {score:.2f} for the given job position.
|
290 |
+
|
291 |
+
Job Requirements:
|
292 |
+
{job_description[:500]}...
|
293 |
+
|
294 |
+
Resume Summary:
|
295 |
+
{resume_text[:800]}...
|
296 |
+
|
297 |
+
Scores:
|
298 |
+
- Overall: {score:.2f}/1.0
|
299 |
+
- Semantic Match: {semantic_score:.2f}/1.0
|
300 |
+
- Keyword Match: {bm25_score:.2f}/1.0
|
301 |
+
- Key Skills: {', '.join(skills[:5])}
|
302 |
+
|
303 |
+
Provide a concise 2-3 sentence explanation of the match quality and key strengths."""
|
304 |
+
|
305 |
+
# Generate response
|
306 |
+
messages = [{"role": "user", "content": prompt}]
|
307 |
+
text = self.explanation_tokenizer.apply_chat_template(
|
308 |
+
messages, tokenize=False, add_generation_prompt=True
|
309 |
)
|
|
|
|
|
|
|
310 |
|
311 |
+
inputs = self.explanation_tokenizer(text, return_tensors="pt").to(self.explanation_model.device)
|
312 |
|
313 |
+
with torch.no_grad():
|
314 |
+
outputs = self.explanation_model.generate(
|
315 |
+
**inputs,
|
316 |
+
max_new_tokens=150,
|
317 |
+
temperature=0.7,
|
318 |
+
do_sample=True,
|
319 |
+
pad_token_id=self.explanation_tokenizer.eos_token_id
|
320 |
+
)
|
321 |
+
|
322 |
+
response = self.explanation_tokenizer.decode(
|
323 |
+
outputs[0][inputs.input_ids.shape[1]:],
|
324 |
+
skip_special_tokens=True
|
325 |
+
)
|
326 |
|
327 |
+
return response.strip()[:400] # Limit length
|
|
|
328 |
|
329 |
+
except Exception as e:
|
330 |
+
st.warning(f"AI explanation failed: {str(e)}")
|
331 |
+
return self._generate_simple_explanation(score, semantic_score, bm25_score, skills)
|
332 |
+
|
333 |
+
def _generate_simple_explanation(self, score, semantic_score, bm25_score, skills):
|
334 |
+
"""Fallback explanation generation"""
|
335 |
+
if score > 0.8:
|
336 |
+
quality = "excellent"
|
337 |
+
elif score > 0.6:
|
338 |
+
quality = "good"
|
339 |
+
elif score > 0.4:
|
340 |
+
quality = "moderate"
|
341 |
+
else:
|
342 |
+
quality = "limited"
|
343 |
+
|
344 |
+
explanation = f"This resume shows {quality} alignment with the job requirements (score: {score:.2f}). "
|
345 |
+
|
346 |
+
if semantic_score > bm25_score:
|
347 |
+
explanation += f"Strong conceptual match ({semantic_score:.2f}) with relevant experience. "
|
348 |
+
else:
|
349 |
+
explanation += f"Good keyword coverage ({bm25_score:.2f}) of job requirements. "
|
350 |
+
|
351 |
+
if skills:
|
352 |
+
explanation += f"Key matching skills: {', '.join(skills[:3])}."
|
353 |
+
|
354 |
+
return explanation
+def create_download_link(df, filename="resume_screening_results.csv"):
+    """Create download link for results"""
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
+    return f'<a href="data:file/csv;base64,{b64}" download="{filename}" class="download-btn">📥 Download Results CSV</a>'
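Because create_download_link only needs a DataFrame, it can be exercised on its own. A small sketch, with made-up rows and filename:

    # Illustrative use of create_download_link; sample data is not from the commit.
    import pandas as pd
    sample = pd.DataFrame([
        {"Rank": 1, "Candidate": "Resume_0", "Overall_Score": 0.81},
        {"Rank": 2, "Candidate": "Resume_1", "Overall_Score": 0.74},
    ])
    link = create_download_link(sample, filename="demo_results.csv")
    # `link` is an <a> tag whose href embeds the CSV as a base64 data URI;
    # the app renders it with st.markdown(link, unsafe_allow_html=True).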
+# Main App Interface
+st.title("🎯 AI-Powered Resume Screener")
+st.markdown("*Find the perfect candidates using advanced AI matching*")
st.markdown("---")

+# Initialize screener
+if st.session_state.embedding_model is None:
+    screener = ResumeScreener()
+    st.session_state.embedding_model = screener.embedding_model
+    st.session_state.explanation_model = screener.explanation_model
+else:
+    screener = ResumeScreener()

+# Job Description Input
+st.header("📝 Step 1: Enter Job Description")
job_description = st.text_area(
+    "Enter the complete job description or requirements:",
+    height=150,
+    placeholder="Paste the job description here, including required skills, experience, and qualifications..."
)

+# Resume Input Options
+st.header("📄 Step 2: Upload Resumes")
+
+input_method = st.radio(
+    "Choose input method:",
+    ["📁 Upload Files", "🗂️ Load from CSV Dataset", "🔗 Load from Hugging Face Dataset"]
)

resume_texts = []
file_names = []

+if input_method == "📁 Upload Files":
    uploaded_files = st.file_uploader(
        "Upload resume files",
+        type=["pdf", "docx", "txt"],
        accept_multiple_files=True,
+        help="Supported formats: PDF, DOCX, TXT"
    )

    if uploaded_files:
+        with st.spinner(f"🔄 Processing {len(uploaded_files)} files..."):
            for file in uploaded_files:
                file_type = file.name.split('.')[-1].lower()

+                # Save temporary file
                with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_type}') as tmp_file:
                    tmp_file.write(file.getvalue())
                    tmp_path = tmp_file.name

+                # Extract text
                text = screener.extract_text_from_file(tmp_path, file_type)
+                if text.strip():
                    resume_texts.append(text)
                    file_names.append(file.name)

+                # Cleanup
                os.unlink(tmp_path)
+
+        if resume_texts:
+            st.success(f"✅ Successfully processed {len(resume_texts)} resumes")
+
+elif input_method == "🗂️ Load from CSV Dataset":
+    csv_file = st.file_uploader("Upload CSV file with resume data", type=["csv"])

+    if csv_file:
+        try:
+            df = pd.read_csv(csv_file)
+            st.write("**CSV Preview:**")
+            st.dataframe(df.head())

+            text_column = st.selectbox(
+                "Select column containing resume text:",
+                df.columns.tolist()
+            )

+            name_column = st.selectbox(
+                "Select column for candidate names/IDs (optional):",
+                ["Use Index"] + df.columns.tolist()
+            )
+
+            if st.button("🚀 Process CSV Data"):
+                with st.spinner("🔄 Processing CSV data..."):
+                    for idx, row in df.iterrows():
+                        text = str(row[text_column])
+                        if text and text.strip() and text.lower() != 'nan':
+                            resume_texts.append(text)

+                            if name_column == "Use Index":
+                                file_names.append(f"Resume_{idx}")
+                            else:
+                                file_names.append(str(row[name_column]))

+                if resume_texts:
+                    st.success(f"✅ Successfully loaded {len(resume_texts)} resumes from CSV")

+        except Exception as e:
+            st.error(f"❌ Error processing CSV: {str(e)}")
+
+elif input_method == "🔗 Load from Hugging Face Dataset":
+    st.markdown("**Quick Load:** [Resume Atlas Dataset](https://huggingface.co/datasets/ahmedheakl/resume-atlas)")
+
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        dataset_name = st.text_input(
+            "Dataset name:",
+            value="ahmedheakl/resume-atlas",
+            help="Enter Hugging Face dataset name"
        )
+    with col2:
+        dataset_split = st.selectbox("Split:", ["train", "test", "validation"], index=0)
+
+    if st.button("🔗 Load from Hugging Face"):
+        try:
+            with st.spinner(f"🔄 Loading {dataset_name}..."):
+                dataset = load_dataset(dataset_name, split=dataset_split)

+            st.success(f"✅ Loaded dataset with {len(dataset)} entries")
+            st.write("**Dataset Preview:**")
+
+            # Show first few examples
+            preview_df = pd.DataFrame(dataset[:5])
+            st.dataframe(preview_df)
+
+            # Column selection
+            text_column = st.selectbox(
+                "Select column with resume text:",
+                dataset.column_names,
+                index=0 if 'resume_text' in dataset.column_names else 0
+            )
+
+            category_column = None
+            if 'category' in dataset.column_names:
+                category_column = st.selectbox(
+                    "Filter by category (optional):",
+                    ["All"] + list(set(dataset['category']))
                )
+
+            max_samples = st.slider("Maximum samples to load:", 10, min(1000, len(dataset)), 100)
+
+            if st.button("🚀 Process Dataset"):
+                with st.spinner("🔄 Processing dataset..."):
+                    filtered_dataset = dataset

+                    # Apply category filter
+                    if category_column and category_column != "All":
+                        filtered_dataset = dataset.filter(lambda x: x['category'] == category_column)

+                    # Limit samples
+                    sample_indices = list(range(min(max_samples, len(filtered_dataset))))

+                    for idx in sample_indices:
+                        item = filtered_dataset[idx]
+                        text = str(item[text_column])

+                        if text and text.strip() and text.lower() != 'nan':
+                            resume_texts.append(text)

+                            # Use ID or index for naming
+                            if 'id' in item:
+                                file_names.append(f"Resume_{item['id']}")
+                            else:
+                                file_names.append(f"Resume_{idx}")
+
+                if resume_texts:
+                    st.success(f"✅ Successfully loaded {len(resume_texts)} resumes")
+
+        except Exception as e:
+            st.error(f"❌ Error loading dataset: {str(e)}")

+# Processing and Results
+if st.button("🔍 Find Best Candidates", disabled=not (job_description and resume_texts)):
+    if len(resume_texts) == 0:
+        st.error("❌ Please upload resumes first!")
+    elif not job_description.strip():
+        st.error("❌ Please enter a job description!")
+    else:
+        with st.spinner("🧠 AI is analyzing resumes..."):
+            # Calculate scores
+            hybrid_scores, semantic_scores, bm25_scores = screener.calculate_hybrid_scores(
+                resume_texts, job_description
+            )

+            # Prepare results
+            results = []
+            for i, (name, text, hybrid_score, semantic_score, bm25_score) in enumerate(
+                zip(file_names, resume_texts, hybrid_scores, semantic_scores, bm25_scores)
+            ):
+                # Extract skills
+                skills = screener.extract_skills(text, job_description)
+
+                # Generate explanation
+                explanation = ""
+                if use_explanation:
+                    explanation = screener.generate_explanation(
+                        text, job_description, hybrid_score, semantic_score, bm25_score, skills
+                    )

+                results.append({
+                    'rank': i + 1,
+                    'name': name,
+                    'score': hybrid_score,
+                    'semantic_score': semantic_score,
+                    'keyword_score': bm25_score,
+                    'skills': skills,
+                    'explanation': explanation,
+                    'text_preview': text[:300] + "..." if len(text) > 300 else text
+                })

+            # Sort by score
+            results.sort(key=lambda x: x['score'], reverse=True)

+            # Update ranks
+            for i, result in enumerate(results):
+                result['rank'] = i + 1

+            # Store in session state
+            st.session_state.results = results[:top_k]

+            st.success(f"🎉 Analysis complete! Found top {len(st.session_state.results)} candidates")

+# Display Results
if st.session_state.results:
+    st.header("🏆 Top Candidates")

+    # Create summary dataframe
+    summary_data = []
    for result in st.session_state.results:
+        summary_data.append({
+            "Rank": result['rank'],
+            "Candidate": result['name'],
+            "Overall Score": f"{result['score']:.3f}",
+            "Semantic Score": f"{result['semantic_score']:.3f}",
+            "Keyword Score": f"{result['keyword_score']:.3f}",
+            "Key Skills": ", ".join(result['skills'][:3]) + ("..." if len(result['skills']) > 3 else ""),
        })

+    summary_df = pd.DataFrame(summary_data)
+    st.dataframe(summary_df, use_container_width=True)
+
+    # Download link
+    detailed_data = []
+    for result in st.session_state.results:
+        detailed_data.append({
+            "Rank": result['rank'],
+            "Candidate": result['name'],
+            "Overall_Score": result['score'],
+            "Semantic_Score": result['semantic_score'],
+            "Keyword_Score": result['keyword_score'],
+            "Skills": "; ".join(result['skills']),
+            "Explanation": result['explanation'],
+            "Resume_Preview": result['text_preview']
+        })

+    download_df = pd.DataFrame(detailed_data)
+    st.markdown(create_download_link(download_df), unsafe_allow_html=True)

+    # Detailed results
+    st.subheader("📋 Detailed Analysis")
+
+    for result in st.session_state.results:
+        with st.expander(f"🥇 #{result['rank']}: {result['name']} (Score: {result['score']:.3f})"):
+            col1, col2 = st.columns([1, 2])

            with col1:
+                st.metric("Overall Score", f"{result['score']:.3f}")
+                st.metric("Semantic Match", f"{result['semantic_score']:.3f}")
+                st.metric("Keyword Match", f"{result['keyword_score']:.3f}")

+                st.write("**🎯 Key Skills:**")
+                for skill in result['skills'][:8]:
+                    st.write(f"• {skill}")

            with col2:
+                if result['explanation']:
+                    st.write("**🤖 AI Analysis:**")
+                    st.info(result['explanation'])
+
+                st.write("**📄 Resume Preview:**")
+                st.text_area("", result['text_preview'], height=150, disabled=True, key=f"preview_{result['rank']}")
+
+    # Score visualization
+    if len(st.session_state.results) > 1:
+        st.subheader("📊 Score Visualization")
+
+        chart_data = pd.DataFrame({
+            'Candidate': [r['name'] for r in st.session_state.results],
+            'Overall Score': [r['score'] for r in st.session_state.results],
+            'Semantic Score': [r['semantic_score'] for r in st.session_state.results],
+            'Keyword Score': [r['keyword_score'] for r in st.session_state.results]
+        })
+
+        st.bar_chart(chart_data.set_index('Candidate'))
+
+    # Memory cleanup
+    if st.button("🧹 Clear Memory"):
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
+        st.success("✅ Memory cleared!")

# Footer
st.markdown("---")
+st.markdown(
+    """
+    <div style='text-align: center; color: #666;'>
+        🚀 Powered by NVIDIA NV-Embed-v2 & Qwen3-14B | Built with Streamlit
+    </div>
+    """,
+    unsafe_allow_html=True
+)
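calculate_hybrid_scores (defined earlier in app.py) returns three parallel lists: the combined score plus the semantic and BM25 components used above. As a rough illustration of how a hybrid score of this kind is typically formed (the 0.7/0.3 weighting and min-max normalization below are assumptions, not necessarily what the app uses):

    import numpy as np

    def hybrid_score_sketch(semantic_scores, bm25_scores, alpha=0.7):
        # Assumed weighting; the app's own combination may differ.
        sem = np.asarray(semantic_scores, dtype=float)
        bm = np.asarray(bm25_scores, dtype=float)
        if bm.max() > bm.min():
            # Normalize BM25 to [0, 1] so it is comparable to cosine-style similarity.
            bm = (bm - bm.min()) / (bm.max() - bm.min())
        return alpha * sem + (1 - alpha) * bm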
explanation_generator.py
DELETED
@@ -1,223 +0,0 @@
"""
Explanation Generator Module

This module handles the generation of explanations for resume rankings
using the Qwen3-14B model from Hugging Face.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import os
import re
import sys

# Use the alternative model loading approach
try:
    # Try to import the functions from alt_models.py
    from alt_models import load_explanation_model
    USE_ALT_MODELS = True
except ImportError:
    USE_ALT_MODELS = False
    # If import fails, we'll use the original approach
    # Add Replicate class workaround if not already defined
    try:
        from transformers.models.qwen2.modeling_qwen2 import Replicate
    except (ImportError, AttributeError):
        class Replicate(torch.nn.Module):
            """Workaround class for missing Replicate in Qwen models"""
            def __init__(self, module, num_replicas=1):
                super().__init__()
                self.module = module
                self.num_replicas = num_replicas

            def forward(self, *args, **kwargs):
                return self.module(*args, **kwargs)

        # Create module structure if it doesn't exist yet
        parent_modules = [
            "transformers.models",
            "transformers.models.qwen2",
        ]

        # Create all parent modules
        for module_path in parent_modules:
            if module_path not in sys.modules:
                sys.modules[module_path] = type('', (), {})

        # Create and add the Replicate class
        if "transformers.models.qwen2.modeling_qwen2" not in sys.modules:
            sys.modules["transformers.models.qwen2.modeling_qwen2"] = type('', (), {})
        sys.modules["transformers.models.qwen2.modeling_qwen2"].Replicate = Replicate

# Load Qwen3 model at initialization time
print("Loading Qwen/Qwen3-14B model with 4-bit quantization...")
QWEN_MODEL_NAME = "Qwen/Qwen3-14B"

if USE_ALT_MODELS:
    # Use the alternative loading approach
    global_qwen_model, global_qwen_tokenizer = load_explanation_model(QWEN_MODEL_NAME)
else:
    # Use original approach
    try:
        # Configure 4-bit quantization for better performance
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )

        # Load Qwen3 model and tokenizer
        global_qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME, trust_remote_code=True)
        global_qwen_model = None

        # Check if we have enough resources to load the model
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.get_device_properties(0).total_memory
            if gpu_memory >= 12 * (1024**3):  # 12 GB (reduced memory requirement compared to 32B model)
                global_qwen_model = AutoModelForCausalLM.from_pretrained(
                    QWEN_MODEL_NAME,
                    quantization_config=quantization_config,
                    device_map="auto",
                    trust_remote_code=True,
                    torch_dtype=torch.float16
                )
                print("Successfully loaded Qwen3-14B with 4-bit quantization")
            else:
                print("Not enough GPU memory, using template-based explanations")
        else:
            print("CUDA not available, using template-based explanations")

    except Exception as e:
        print(f"Error loading Qwen3-14B model: {str(e)}")
        print("Falling back to template-based explanations.")
        global_qwen_tokenizer = None
        global_qwen_model = None

class ExplanationGenerator:
    def __init__(self, model_name="Qwen/Qwen3-14B"):
        """Initialize the explanation generator with the specified model"""
        self.model_name = model_name
        # Use globally pre-loaded model and tokenizer
        self.model = global_qwen_model
        self.tokenizer = global_qwen_tokenizer
        self.initialized = True

    def generate_explanation(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
        """Generate explanation for why a resume was ranked highly"""
        # Use the model if it's available
        if self.model is not None and self.tokenizer is not None:
            try:
                # Prepare prompt for Qwen3-14B
                prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)

                # Create messages for chat format
                messages = [
                    {"role": "user", "content": prompt}
                ]

                # Apply chat template with thinking mode enabled
                text = self.tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True,
                    enable_thinking=True
                )

                # Tokenize
                inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)

                # Generate response with recommended parameters for thinking mode
                output_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=500,
                    temperature=0.6,
                    top_p=0.95,
                    top_k=20
                )

                # Decode the response
                response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

                # Clean up the response
                cleaned_response = self._clean_response(response)

                return cleaned_response

            except Exception as e:
                print(f"Error generating explanation with Qwen3-14B: {str(e)}")
                # Fall back to template-based explanation
                return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
        else:
            # Use template-based explanation if model is not available
            return self._generate_template_explanation(score, semantic_score, keyword_score, skills)

    def _create_prompt(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
        """Create a prompt for the explanation generation"""
        # Use only the first 1000 characters of the resume to keep prompt size manageable
        resume_excerpt = resume_text[:1000] + "..." if len(resume_text) > 1000 else resume_text

        prompt = f"""You are an AI assistant helping a recruiter understand why a candidate's resume was matched with a job posting.

The resume has been assigned the following scores:
- Overall Match Score: {score:.2f} out of 1.0
- Semantic Relevance Score: {semantic_score:.2f} out of 1.0
- Keyword Match Score: {keyword_score:.2f} out of 1.0

The job description is:
```
{job_description}
```

Based on analysis, the resume contains these skills relevant to the job: {', '.join(skills)}

Resume excerpt:
```
{resume_excerpt}
```

Please provide a short explanation (3-5 sentences) of why this resume received these scores and how well it matches the job requirements. Focus on the relationship between the candidate's experience and the job requirements."""

        return prompt

    def _clean_response(self, response):
        """Clean the response from the model"""
        # Remove any thinking or internal processing tokens
        response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)

        # Limit to a reasonable length
        if len(response) > 500:
            sentences = response.split('.')
            shortened = '.'.join(sentences[:5]) + '.'
            return shortened

        return response

    def _generate_template_explanation(self, score, semantic_score, keyword_score, skills):
        """Generate a template-based explanation when the model is not available"""
        # Simple template-based explanation
        if score > 0.8:
            quality = "excellent"
        elif score > 0.6:
            quality = "good"
        elif score > 0.4:
            quality = "moderate"
        else:
            quality = "limited"

        explanation = f"This resume shows {quality} alignment with the job requirements, with an overall score of {score:.2f}. "

        if semantic_score > keyword_score:
            explanation += f"The candidate's experience demonstrates strong semantic relevance ({semantic_score:.2f}) to the position, though specific keyword matches ({keyword_score:.2f}) could be improved. "
        else:
            explanation += f"The resume contains many relevant keywords ({keyword_score:.2f}), but could benefit from better contextual alignment ({semantic_score:.2f}) with the job requirements. "

        if skills:
            if len(skills) > 3:
                explanation += f"Key skills identified include {', '.join(skills[:3])}, and {len(skills)-3} others that match the job requirements."
            else:
                explanation += f"Key skills identified include {', '.join(skills)}."
        else:
            explanation += "No specific skills were identified that directly match the requirements."

        return explanation
fix_dependencies.py
DELETED
@@ -1,76 +0,0 @@
#!/usr/bin/env python
"""
Dependency fixer for Resume Screener and Skill Extractor
This script ensures all dependencies are properly installed with compatible versions.
"""

import sys
import subprocess
import pkg_resources
import os

def install(package):
    """Install a package using pip"""
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

def install_with_message(package, message=None):
    """Install a package with an optional message"""
    if message:
        print(f"\n{message}")
    print(f"Installing {package}...")
    install(package)

def main():
    print("Running dependency fixer for Resume Screener and Skill Extractor...")

    # Install core dependencies first
    install_with_message("pip==23.1.2", "Upgrading pip to ensure compatibility")
    install_with_message("setuptools==68.0.0", "Installing compatible setuptools")

    # Check if we're in a Hugging Face Space
    in_hf_space = os.environ.get("SPACE_ID") is not None

    # Install key libraries with specific versions to ensure compatibility
    dependencies = [
        ("streamlit==1.31.0", "Installing Streamlit for the web interface"),
        ("pdfplumber==0.10.1", "Installing PDF processing libraries"),
        ("PyPDF2==3.0.1", None),
        ("python-docx==1.0.1", None),
        ("rank-bm25==0.2.2", "Installing BM25 ranking library"),
        ("tqdm==4.66.1", "Installing progress bar utility"),
        ("faiss-cpu==1.7.4", "Installing FAISS for vector similarity search"),
        ("huggingface-hub==0.20.3", "Installing Hugging Face Hub"),
        ("transformers==4.36.2", "Installing Transformers"),
        ("sentence-transformers==2.2.2", "Installing Sentence Transformers"),
        ("torch==2.1.2", "Installing PyTorch"),
        ("nltk==3.8.1", "Installing NLTK for text processing"),
        ("pandas==2.1.3", "Installing data processing libraries"),
        ("numpy==1.24.3", None),
        ("plotly==5.18.0", "Installing visualization libraries"),
        ("spacy==3.7.2", "Installing spaCy for NLP"),
    ]

    # Install all dependencies
    for package, message in dependencies:
        install_with_message(package, message)

    # Download required NLTK data
    print("\nDownloading NLTK data...")
    install("nltk")
    import nltk
    nltk.download('punkt')

    # Download spaCy model if not in a Hugging Face Space
    # (Spaces should include this in the requirements.txt)
    if not in_hf_space:
        print("\nDownloading spaCy model...")
        try:
            subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
        except:
            install("https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz")

    print("\nDependency installation complete!")
    print("You can now run the Resume Screener with: streamlit run app.py")

if __name__ == "__main__":
    main()
requirements.txt
CHANGED
@@ -1,22 +1,19 @@
 streamlit==1.31.0
+transformers==4.48.0
+torch==2.1.2
 pdfplumber==0.10.1
 PyPDF2==3.0.1
 python-docx==1.0.1
-spacy==3.7.2
-https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz
-transformers==4.48.0
-torch==2.1.2
 nltk==3.8.1
 faiss-cpu==1.7.4
 rank-bm25==0.2.2
-sentence-transformers==2.7.0
-plotly==5.18.0
 pandas==2.1.3
 numpy==1.24.3
 tqdm==4.66.1
 huggingface-hub==0.27.1
+bitsandbytes==0.44.1
+accelerate==0.27.2
+datasets==2.18.0
+sentence-transformers==2.7.0
+plotly==5.18.0
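Note that base64 ships with the Python standard library and is not a PyPI requirement, so it does not belong in this file. After installing the pins above, a quick import check can confirm the heavy dependencies resolved as expected (the expected versions in the comments simply mirror the pins):

    # Sanity check for the pinned environment.
    import torch
    import transformers
    import datasets
    import sentence_transformers

    print("torch", torch.__version__)                      # expect 2.1.2
    print("transformers", transformers.__version__)        # expect 4.48.0
    print("datasets", datasets.__version__)                # expect 2.18.0
    print("sentence-transformers", sentence_transformers.__version__)  # expect 2.7.0
    print("CUDA available:", torch.cuda.is_available())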