root committed · Commit ba2dfe6 · 1 Parent(s): 53cdf96

Files changed (3):
  1. app.py  +114 -13
  2. explanation_generator.py  +106 -49
  3. requirements.txt  +2 -0
app.py CHANGED
@@ -34,6 +34,38 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
 
+# Hugging Face Spaces optimization
+RUNNING_ON_SPACES = os.environ.get('SPACE_ID') is not None
+if RUNNING_ON_SPACES:
+    st.sidebar.info("🚀 Running on Hugging Face Spaces")
+
+    # Set up cache directory structure
+    CACHE_DIR = os.path.join(os.getcwd(), ".cache")
+    HF_HOME = os.path.join(CACHE_DIR, "huggingface")
+    os.environ['TRANSFORMERS_CACHE'] = os.path.join(HF_HOME, "transformers")
+    os.environ['HF_HOME'] = HF_HOME
+    os.environ['HF_DATASETS_CACHE'] = os.path.join(HF_HOME, "datasets")
+
+    # Create cache directories if they don't exist
+    for dir_path in [CACHE_DIR, HF_HOME, os.environ['TRANSFORMERS_CACHE'], os.environ['HF_DATASETS_CACHE']]:
+        if not os.path.exists(dir_path):
+            os.makedirs(dir_path)
+
+    # Use downloaded models if available (avoid downloading on every run)
+    os.environ['TRANSFORMERS_OFFLINE'] = '1'
+
+    # Spaces optimization flags
+    USE_PIPELINE = True
+    OPTIMIZE_MEMORY = True
+
+    # Print setup information
+    print(f"Running on Hugging Face Spaces: {os.environ.get('SPACE_ID')}")
+    print(f"Cache directory: {CACHE_DIR}")
+    print(f"HF Home: {HF_HOME}")
+else:
+    USE_PIPELINE = False
+    OPTIMIZE_MEMORY = False
+
 # Sidebar for model selection and weights
 with st.sidebar:
     st.title("Configuration")
@@ -63,9 +95,15 @@ with st.sidebar:
     use_explanation = st.checkbox("Generate Explanations", value=True)
     use_faiss = st.checkbox("Use FAISS for fast search", value=True)
 
+    # Hugging Face Spaces optimization options
+    if not RUNNING_ON_SPACES:
+        st.subheader("Hugging Face Spaces Optimization")
+        USE_PIPELINE = st.checkbox("Use pipeline API for faster loading", value=USE_PIPELINE)
+        OPTIMIZE_MEMORY = st.checkbox("Optimize memory usage", value=OPTIMIZE_MEMORY)
+
     # Memory optimization options
     st.subheader("Memory Optimization")
-    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
+    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=OPTIMIZE_MEMORY)
     clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
     gc_collect_interval = st.number_input(
         "Garbage collection interval (files)",
@@ -95,31 +133,59 @@ if 'faiss_index' not in st.session_state:
     st.session_state.faiss_index = None
 if 'explanation_generator' not in st.session_state:
     st.session_state.explanation_generator = None
+if 'screener' not in st.session_state:
+    st.session_state.screener = None
 
 class ResumeScreener:
-    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B"):
+    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B", load_immediately=True):
         """Initialize the ResumeScreener with the specified embedding model"""
         self.embedding_model_name = embedding_model_name
         self.explanation_model_name = explanation_model_name
         self.model = None
         self.tokenizer = None
+        self.embedding_pipeline = None
         self.faiss_index = None
         self.embedding_size = None
         self.explanation_generator = None
 
+        # Load models immediately if requested
+        if load_immediately:
+            with st.spinner("Loading models at startup..."):
+                self.load_model()
+                if use_explanation:
+                    self.load_explanation_generator()
+
     def load_model(self):
         """Load the embedding model from Hugging Face"""
         if st.session_state.embedding_model is None:
             with st.spinner(f"Loading model {self.embedding_model_name}..."):
                 try:
-                    if "sentence-transformers" in self.embedding_model_name:
-                        self.model = SentenceTransformer(self.embedding_model_name)
-                    else:
-                        self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name, trust_remote_code=True)
-                        self.model = AutoModel.from_pretrained(self.embedding_model_name, trust_remote_code=True)
+                    # First try to use pipeline for more efficient loading
+                    try:
+                        from transformers import pipeline
+                        self.embedding_pipeline = pipeline(
+                            "feature-extraction",
+                            model=self.embedding_model_name,
+                            trust_remote_code=True,
+                            device_map="auto"
+                        )
+                        print(f"Successfully loaded {self.embedding_model_name} with pipeline API")
+                        self.model = self.embedding_pipeline.model
+                        self.tokenizer = self.embedding_pipeline.tokenizer
+                    except Exception as pipe_e:
+                        print(f"Error loading with pipeline API: {str(pipe_e)}")
+                        print("Falling back to direct model loading...")
+
+                        if "sentence-transformers" in self.embedding_model_name:
+                            self.model = SentenceTransformer(self.embedding_model_name)
+                        else:
+                            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name, trust_remote_code=True)
+                            self.model = AutoModel.from_pretrained(self.embedding_model_name, trust_remote_code=True)
 
                     st.session_state.embedding_model = self.model
                     st.session_state.tokenizer = self.tokenizer
+                    if self.embedding_pipeline:
+                        st.session_state.embedding_pipeline = self.embedding_pipeline
 
                     # Get embedding size
                     if "sentence-transformers" in self.embedding_model_name:
@@ -134,11 +200,18 @@ class ResumeScreener:
         else:
             self.model = st.session_state.embedding_model
             self.tokenizer = st.session_state.tokenizer
-
-        # Initialize explanation generator if needed
+            if 'embedding_pipeline' in st.session_state:
+                self.embedding_pipeline = st.session_state.embedding_pipeline
+
+    def load_explanation_generator(self):
+        """Load the explanation generator if needed"""
         if use_explanation and st.session_state.explanation_generator is None:
-            st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
-            self.explanation_generator = st.session_state.explanation_generator
+            with st.spinner(f"Loading explanation model {self.explanation_model_name}..."):
+                st.session_state.explanation_generator = ExplanationGenerator(
+                    self.explanation_model_name,
+                    load_immediately=True
+                )
+                self.explanation_generator = st.session_state.explanation_generator
         elif use_explanation:
            self.explanation_generator = st.session_state.explanation_generator
 
@@ -186,6 +259,29 @@ class ResumeScreener:
 
     def get_embedding(self, text):
         """Generate text embedding for a given text"""
+        # Try using pipeline first if available
+        if self.embedding_pipeline:
+            try:
+                # Pipeline returns list of list of embeddings, we want just one vector
+                embeddings = self.embedding_pipeline(
+                    text,
+                    padding=True,
+                    truncation=True,
+                    max_length=512
+                )
+                # Mean pooling across token dimension for BERT-like models
+                embedding_np = np.mean(embeddings[0], axis=0)
+
+                # Set embedding size if not set
+                if self.embedding_size is None:
+                    self.embedding_size = embedding_np.shape[0]
+
+                return embedding_np
+            except Exception as e:
+                print(f"Error using embedding pipeline: {str(e)}")
+                print("Falling back to direct embedding method...")
+
+        # Fall back to original method
         if "sentence-transformers" in self.embedding_model_name:
             # For sentence-transformers models
             embedding = self.model.encode([text], convert_to_tensor=True, show_progress_bar=False)[0]
@@ -476,8 +572,13 @@ def get_huggingface_spaces_datasets():
 st.title("Resume Screener & Skill Extractor")
 st.markdown("---")
 
-# Initialize the resume screener
-screener = ResumeScreener(embedding_model_name, explanation_model_name)
+# Initialize the resume screener at startup
+if st.session_state.screener is None:
+    with st.spinner("Initializing Resume Screener..."):
+        screener = ResumeScreener(embedding_model_name, explanation_model_name, load_immediately=True)
+        st.session_state.screener = screener
+else:
+    screener = st.session_state.screener
 
 # Job description input
 st.header("1. Enter Job Description")
explanation_generator.py CHANGED
@@ -6,52 +6,95 @@ using the QwQ-32B model from Hugging Face.
 """
 
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
 import os
 import re
 
 class ExplanationGenerator:
-    def __init__(self, model_name="Qwen/QwQ-32B"):
+    def __init__(self, model_name="Qwen/QwQ-32B", load_immediately=True):
         """Initialize the explanation generator with the specified model"""
         self.model_name = model_name
         self.model = None
         self.tokenizer = None
+        self.text_generation_pipeline = None
         self.initialized = False
 
+        # Load model immediately if requested
+        if load_immediately:
+            self.load_model()
+
     def load_model(self):
         """Load the model and tokenizer if not already loaded"""
         if not self.initialized:
             try:
-                # Check if we have enough VRAM for loading the model
-                if torch.cuda.is_available():
-                    gpu_memory = torch.cuda.get_device_properties(0).total_memory
-                    # QwQ-32B requires at least 32GB VRAM for full precision
-                    if gpu_memory >= 32 * (1024**3):  # 32 GB
-                        device = "cuda"
-                    else:
-                        device = "cpu"
-                else:
-                    device = "cpu"
+                print(f"Loading explanation model: {self.model_name}")
+
+                # Set up 4-bit quantization configuration
+                quantization_config = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.bfloat16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4"
+                )
+
+                # Try using pipeline API for more efficient loading in Spaces
+                try:
+                    print("Attempting to load model with pipeline API...")
+                    self.text_generation_pipeline = pipeline(
+                        "text-generation",
+                        model=self.model_name,
+                        torch_dtype=torch.bfloat16,
+                        device_map="auto",
+                        trust_remote_code=True,
+                        quantization_config=quantization_config,
+                        model_kwargs={"attn_implementation": "eager"}  # Uses less memory
+                    )
+                    print(f"Successfully loaded {self.model_name} with pipeline API")
+                    # Pipeline includes both model and tokenizer
+                    self.tokenizer = self.text_generation_pipeline.tokenizer
+                    self.model = self.text_generation_pipeline.model
+                    self.initialized = True
+                    return
+                except Exception as pipe_e:
+                    print(f"Error loading with pipeline API: {str(pipe_e)}")
+                    print("Falling back to direct model loading...")
 
                 # Load tokenizer
-                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.model_name,
+                    trust_remote_code=True
+                )
 
-                # Load model based on available resources
-                if device == "cuda":
+                # Try to load model with 4-bit quantization
+                try:
                     self.model = AutoModelForCausalLM.from_pretrained(
                         self.model_name,
-                        torch_dtype=torch.bfloat16,
                         device_map="auto",
-                        trust_remote_code=True
+                        trust_remote_code=True,
+                        quantization_config=quantization_config
                     )
-                else:
-                    # Fall back to a simpler template-based solution if we can't load the model
-                    self.model = None
-                    print("Warning: Loading QwQ-32B on CPU is not recommended. Using template-based explanations instead.")
+                    print(f"Successfully loaded {self.model_name} with 4-bit quantization")
+                except Exception as quant_e:
+                    print(f"Error loading with 4-bit quantization: {str(quant_e)}")
+                    print("Trying to load model with 8-bit quantization...")
+
+                    # Fall back to 8-bit or CPU if 4-bit fails
+                    if torch.cuda.is_available():
+                        self.model = AutoModelForCausalLM.from_pretrained(
+                            self.model_name,
+                            device_map="auto",
+                            trust_remote_code=True,
+                            load_in_8bit=True
+                        )
+                        print(f"Successfully loaded {self.model_name} with 8-bit quantization")
+                    else:
+                        # Fall back to template-based solution if no GPU
+                        self.model = None
+                        print(f"Warning: Loading {self.model_name} on CPU is not recommended. Using template-based explanations instead.")
 
                 self.initialized = True
             except Exception as e:
-                print(f"Error loading QwQ-32B model: {str(e)}")
+                print(f"Error loading explanation model: {str(e)}")
                 print("Falling back to template-based explanations.")
                 self.model = None
                 self.initialized = True
@@ -68,32 +111,46 @@ class ExplanationGenerator:
                 # Prepare prompt for QwQ-32B
                 prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)
 
-                # Create messages for chat format
-                messages = [
-                    {"role": "user", "content": prompt}
-                ]
-
-                # Apply chat template
-                text = self.tokenizer.apply_chat_template(
-                    messages,
-                    tokenize=False,
-                    add_generation_prompt=True
-                )
-
-                # Tokenize
-                inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
-
-                # Generate response
-                output_ids = self.model.generate(
-                    **inputs,
-                    max_new_tokens=300,
-                    temperature=0.6,
-                    top_p=0.95,
-                    top_k=30
-                )
-
-                # Decode the response
-                response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+                # Use pipeline API if available
+                if self.text_generation_pipeline is not None:
+                    outputs = self.text_generation_pipeline(
+                        prompt,
+                        max_new_tokens=300,
+                        temperature=0.6,
+                        top_p=0.95,
+                        top_k=30,
+                        do_sample=True,
+                        return_full_text=False
+                    )
+                    response = outputs[0]['generated_text']
+
+                else:
+                    # Create messages for chat format
+                    messages = [
+                        {"role": "user", "content": prompt}
+                    ]
+
+                    # Apply chat template
+                    text = self.tokenizer.apply_chat_template(
+                        messages,
+                        tokenize=False,
+                        add_generation_prompt=True
+                    )
+
+                    # Tokenize
+                    inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
+
+                    # Generate response
+                    output_ids = self.model.generate(
+                        **inputs,
+                        max_new_tokens=300,
+                        temperature=0.6,
+                        top_p=0.95,
+                        top_k=30
+                    )
+
+                    # Decode the response
+                    response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
                 # Clean up the response
                 cleaned_response = self._clean_response(response)
@@ -101,7 +158,7 @@ class ExplanationGenerator:
                 return cleaned_response
 
             except Exception as e:
-                print(f"Error generating explanation with QwQ-32B: {str(e)}")
+                print(f"Error generating explanation with model: {str(e)}")
                 # Fall back to template-based explanation
                 return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
         else:
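The quantized-loading path above rests on BitsAndBytesConfig plus device_map="auto". Below is a minimal sketch of that pattern under stated assumptions: the model name is a small stand-in for a smoke test (the app's default Qwen/QwQ-32B needs on the order of 16 GB of GPU memory for weights alone even in 4-bit, which is why the template fallback stays in place), and a CUDA GPU with bitsandbytes and accelerate installed is assumed. If a given transformers version rejects quantization_config as a top-level pipeline() argument, the inner except in load_model() falls through to this direct-loading route anyway.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights as 4-bit NF4
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # run matmuls in bf16
)

model_name = "Qwen/Qwen2.5-0.5B-Instruct"   # assumption: any small causal LM works for a test
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",                      # accelerate places layers on available devices
    quantization_config=quant_config,       # requires bitsandbytes + CUDA
)

prompt = "Explain in two sentences why a Python/Django resume fits a backend role:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=60, do_sample=True, temperature=0.6, top_p=0.95)
print(tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))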
requirements.txt CHANGED
@@ -16,3 +16,5 @@ numpy==1.24.3
 tqdm==4.66.1
 huggingface-hub==0.25.0
 einops
+bitsandbytes>=0.41.0
+accelerate>=0.21.0
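Both new pins back the quantization path: bitsandbytes supplies the 4-bit/8-bit CUDA kernels behind BitsAndBytesConfig and load_in_8bit, and accelerate is what transformers uses to honor device_map="auto" when placing model layers on the available devices. On CPU-only hardware the quantized branches will typically fail and the code falls back to template-based explanations, so the extra packages are harmless there.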