root committed on
Commit 0bfe6dd · 1 Parent(s): ba2dfe6
Files changed (3)
  1. app.py +39 -153
  2. explanation_generator.py +74 -132
  3. requirements.txt +1 -1
app.py CHANGED
@@ -26,6 +26,20 @@ try:
 except LookupError:
     nltk.download('punkt')
 
+# Initialize embedding model at startup
+EMBEDDING_MODEL_NAME = "nvidia/NV-Embed-v2"
+print(f"Loading embedding model {EMBEDDING_MODEL_NAME}...")
+
+try:
+    # Load embedding model and tokenizer
+    global_embedding_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
+    global_embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True, device_map="auto")
+    print(f"Successfully loaded {EMBEDDING_MODEL_NAME}")
+except Exception as e:
+    print(f"Error loading embedding model: {str(e)}")
+    global_embedding_tokenizer = None
+    global_embedding_model = None
+
 # Set page configuration
 st.set_page_config(
     page_title="Resume Screener & Skill Extractor",
@@ -34,38 +48,6 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
 
-# Hugging Face Spaces optimization
-RUNNING_ON_SPACES = os.environ.get('SPACE_ID') is not None
-if RUNNING_ON_SPACES:
-    st.sidebar.info("🚀 Running on Hugging Face Spaces")
-
-    # Set up cache directory structure
-    CACHE_DIR = os.path.join(os.getcwd(), ".cache")
-    HF_HOME = os.path.join(CACHE_DIR, "huggingface")
-    os.environ['TRANSFORMERS_CACHE'] = os.path.join(HF_HOME, "transformers")
-    os.environ['HF_HOME'] = HF_HOME
-    os.environ['HF_DATASETS_CACHE'] = os.path.join(HF_HOME, "datasets")
-
-    # Create cache directories if they don't exist
-    for dir_path in [CACHE_DIR, HF_HOME, os.environ['TRANSFORMERS_CACHE'], os.environ['HF_DATASETS_CACHE']]:
-        if not os.path.exists(dir_path):
-            os.makedirs(dir_path)
-
-    # Use downloaded models if available (avoid downloading on every run)
-    os.environ['TRANSFORMERS_OFFLINE'] = '1'
-
-    # Spaces optimization flags
-    USE_PIPELINE = True
-    OPTIMIZE_MEMORY = True
-
-    # Print setup information
-    print(f"Running on Hugging Face Spaces: {os.environ.get('SPACE_ID')}")
-    print(f"Cache directory: {CACHE_DIR}")
-    print(f"HF Home: {HF_HOME}")
-else:
-    USE_PIPELINE = False
-    OPTIMIZE_MEMORY = False
-
 # Sidebar for model selection and weights
 with st.sidebar:
     st.title("Configuration")
@@ -95,15 +77,9 @@ with st.sidebar:
     use_explanation = st.checkbox("Generate Explanations", value=True)
     use_faiss = st.checkbox("Use FAISS for fast search", value=True)
 
-    # Hugging Face Spaces optimization options
-    if not RUNNING_ON_SPACES:
-        st.subheader("Hugging Face Spaces Optimization")
-        USE_PIPELINE = st.checkbox("Use pipeline API for faster loading", value=USE_PIPELINE)
-        OPTIMIZE_MEMORY = st.checkbox("Optimize memory usage", value=OPTIMIZE_MEMORY)
-
     # Memory optimization options
     st.subheader("Memory Optimization")
-    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=OPTIMIZE_MEMORY)
+    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
     clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
     gc_collect_interval = st.number_input(
         "Garbage collection interval (files)",
@@ -126,92 +102,31 @@ if 'job_description' not in st.session_state:
 if 'results' not in st.session_state:
     st.session_state.results = []
 if 'embedding_model' not in st.session_state:
-    st.session_state.embedding_model = None
+    st.session_state.embedding_model = global_embedding_model
 if 'tokenizer' not in st.session_state:
-    st.session_state.tokenizer = None
+    st.session_state.tokenizer = global_embedding_tokenizer
 if 'faiss_index' not in st.session_state:
     st.session_state.faiss_index = None
 if 'explanation_generator' not in st.session_state:
     st.session_state.explanation_generator = None
-if 'screener' not in st.session_state:
-    st.session_state.screener = None
 
 class ResumeScreener:
-    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B", load_immediately=True):
+    def __init__(self, embedding_model_name="nvidia/NV-Embed-v2", explanation_model_name="Qwen/QwQ-32B"):
         """Initialize the ResumeScreener with the specified embedding model"""
         self.embedding_model_name = embedding_model_name
         self.explanation_model_name = explanation_model_name
-        self.model = None
-        self.tokenizer = None
-        self.embedding_pipeline = None
+        # Initialize with preloaded models
+        self.model = st.session_state.embedding_model
+        self.tokenizer = st.session_state.tokenizer
         self.faiss_index = None
         self.embedding_size = None
         self.explanation_generator = None
 
-        # Load models immediately if requested
-        if load_immediately:
-            with st.spinner("Loading models at startup..."):
-                self.load_model()
-                if use_explanation:
-                    self.load_explanation_generator()
-
-    def load_model(self):
-        """Load the embedding model from Hugging Face"""
-        if st.session_state.embedding_model is None:
-            with st.spinner(f"Loading model {self.embedding_model_name}..."):
-                try:
-                    # First try to use pipeline for more efficient loading
-                    try:
-                        from transformers import pipeline
-                        self.embedding_pipeline = pipeline(
-                            "feature-extraction",
-                            model=self.embedding_model_name,
-                            trust_remote_code=True,
-                            device_map="auto"
-                        )
-                        print(f"Successfully loaded {self.embedding_model_name} with pipeline API")
-                        self.model = self.embedding_pipeline.model
-                        self.tokenizer = self.embedding_pipeline.tokenizer
-                    except Exception as pipe_e:
-                        print(f"Error loading with pipeline API: {str(pipe_e)}")
-                        print("Falling back to direct model loading...")
-
-                        if "sentence-transformers" in self.embedding_model_name:
-                            self.model = SentenceTransformer(self.embedding_model_name)
-                        else:
-                            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name, trust_remote_code=True)
-                            self.model = AutoModel.from_pretrained(self.embedding_model_name, trust_remote_code=True)
-
-                    st.session_state.embedding_model = self.model
-                    st.session_state.tokenizer = self.tokenizer
-                    if self.embedding_pipeline:
-                        st.session_state.embedding_pipeline = self.embedding_pipeline
-
-                    # Get embedding size
-                    if "sentence-transformers" in self.embedding_model_name:
-                        self.embedding_size = self.model.get_sentence_embedding_dimension()
-                    else:
-                        # For non-sentence-transformers, we'll determine this after first embedding
-                        pass
-
-                except Exception as e:
-                    st.error(f"Error loading model: {str(e)}")
-                    st.stop()
-        else:
-            self.model = st.session_state.embedding_model
-            self.tokenizer = st.session_state.tokenizer
-            if 'embedding_pipeline' in st.session_state:
-                self.embedding_pipeline = st.session_state.embedding_pipeline
-
-    def load_explanation_generator(self):
-        """Load the explanation generator if needed"""
+        # Initialize explanation generator
         if use_explanation and st.session_state.explanation_generator is None:
-            with st.spinner(f"Loading explanation model {self.explanation_model_name}..."):
-                st.session_state.explanation_generator = ExplanationGenerator(
-                    self.explanation_model_name,
-                    load_immediately=True
-                )
-            self.explanation_generator = st.session_state.explanation_generator
+            with st.spinner("Initializing explanation generator..."):
+                st.session_state.explanation_generator = ExplanationGenerator(self.explanation_model_name)
+            self.explanation_generator = st.session_state.explanation_generator
         elif use_explanation:
             self.explanation_generator = st.session_state.explanation_generator
 
@@ -259,42 +174,18 @@ class ResumeScreener:
 
     def get_embedding(self, text):
         """Generate text embedding for a given text"""
-        # Try using pipeline first if available
-        if self.embedding_pipeline:
-            try:
-                # Pipeline returns list of list of embeddings, we want just one vector
-                embeddings = self.embedding_pipeline(
-                    text,
-                    padding=True,
-                    truncation=True,
-                    max_length=512
-                )
-                # Mean pooling across token dimension for BERT-like models
-                embedding_np = np.mean(embeddings[0], axis=0)
-
-                # Set embedding size if not set
-                if self.embedding_size is None:
-                    self.embedding_size = embedding_np.shape[0]
-
-                return embedding_np
-            except Exception as e:
-                print(f"Error using embedding pipeline: {str(e)}")
-                print("Falling back to direct embedding method...")
-
-        # Fall back to original method
-        if "sentence-transformers" in self.embedding_model_name:
-            # For sentence-transformers models
-            embedding = self.model.encode([text], convert_to_tensor=True, show_progress_bar=False)[0]
-            embedding_np = embedding.cpu().detach().numpy()
+        if self.model is None:
+            st.error("Embedding model not available. Please check your environment.")
+            return np.zeros(768)  # Default embedding size as fallback
 
-            # Set embedding size if not set
-            if self.embedding_size is None:
-                self.embedding_size = embedding_np.shape[0]
-
-            return embedding_np
-        else:
+        try:
             # For HuggingFace models
             inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
+
+            # Move inputs to same device as model
+            device = next(self.model.parameters()).device
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+
             with torch.no_grad():
                 outputs = self.model(**inputs)
 
@@ -318,6 +209,9 @@ class ResumeScreener:
                 self.embedding_size = embedding_np.shape[0]
 
             return embedding_np
+        except Exception as e:
+            st.error(f"Error generating embedding: {str(e)}")
+            return np.zeros(768)  # Default embedding size as fallback
 
     def create_faiss_index(self, embeddings):
         """Create a FAISS index for fast similarity search"""
@@ -572,13 +466,8 @@ def get_huggingface_spaces_datasets():
 st.title("Resume Screener & Skill Extractor")
 st.markdown("---")
 
-# Initialize the resume screener at startup
-if st.session_state.screener is None:
-    with st.spinner("Initializing Resume Screener..."):
-        screener = ResumeScreener(embedding_model_name, explanation_model_name, load_immediately=True)
-        st.session_state.screener = screener
-else:
-    screener = st.session_state.screener
+# Initialize the resume screener
+screener = ResumeScreener(embedding_model_name, explanation_model_name)
 
 # Job description input
 st.header("1. Enter Job Description")
@@ -902,9 +791,6 @@ elif upload_option == "Upload from Dataset":
 
 # Process button
 if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
-    with st.spinner("Loading embedding model..."):
-        screener.load_model()
-
    with st.spinner("Processing job description and resumes..."):
        # Get job description embedding
        job_embedding = screener.get_embedding(job_description)
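For reference, a minimal standalone sketch of the load-once-then-reuse pattern this diff adopts in app.py (illustrative only; the helper name and the MiniLM model ID below are hypothetical stand-ins, not part of the commit):

# Illustrative sketch: load a model once per process and reuse it everywhere.
# `load_model_once` and the model ID are hypothetical stand-ins.
from functools import lru_cache
from transformers import AutoModel, AutoTokenizer

@lru_cache(maxsize=1)
def load_model_once(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Load (tokenizer, model) a single time; later calls return the cached pair."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model

# Every caller shares the same objects instead of reloading per request.
tokenizer, model = load_model_once()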
explanation_generator.py CHANGED
@@ -6,151 +6,93 @@ using the QwQ-32B model from Hugging Face.
 """
 
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import os
 import re
 
+# Load QwQ model at initialization time
+print("Loading Qwen/QwQ-32B model with 4-bit quantization...")
+QWQ_MODEL_NAME = "Qwen/QwQ-32B"
+
+try:
+    # Configure 4-bit quantization for better performance
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_use_double_quant=True
+    )
+
+    # Load QwQ model and tokenizer
+    global_qwq_tokenizer = AutoTokenizer.from_pretrained(QWQ_MODEL_NAME, trust_remote_code=True)
+    global_qwq_model = None
+
+    # Check if we have enough resources to load the model
+    if torch.cuda.is_available():
+        gpu_memory = torch.cuda.get_device_properties(0).total_memory
+        if gpu_memory >= 16 * (1024**3):  # 16 GB (reduced thanks to quantization)
+            global_qwq_model = AutoModelForCausalLM.from_pretrained(
+                QWQ_MODEL_NAME,
+                quantization_config=quantization_config,
+                device_map="auto",
+                trust_remote_code=True,
+                torch_dtype=torch.float16
+            )
+            print("Successfully loaded QwQ-32B with 4-bit quantization")
+        else:
+            print("Not enough GPU memory, using template-based explanations")
+    else:
+        print("CUDA not available, using template-based explanations")
+
+except Exception as e:
+    print(f"Error loading QwQ-32B model: {str(e)}")
+    print("Falling back to template-based explanations.")
+    global_qwq_tokenizer = None
+    global_qwq_model = None
+
 class ExplanationGenerator:
-    def __init__(self, model_name="Qwen/QwQ-32B", load_immediately=True):
+    def __init__(self, model_name="Qwen/QwQ-32B"):
         """Initialize the explanation generator with the specified model"""
         self.model_name = model_name
-        self.model = None
-        self.tokenizer = None
-        self.text_generation_pipeline = None
-        self.initialized = False
+        # Use globally pre-loaded model and tokenizer
+        self.model = global_qwq_model
+        self.tokenizer = global_qwq_tokenizer
+        self.initialized = True
 
-        # Load model immediately if requested
-        if load_immediately:
-            self.load_model()
-
-    def load_model(self):
-        """Load the model and tokenizer if not already loaded"""
-        if not self.initialized:
+    def generate_explanation(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
+        """Generate explanation for why a resume was ranked highly"""
+        # Use the model if it's available
+        if self.model is not None and self.tokenizer is not None:
             try:
-                print(f"Loading explanation model: {self.model_name}")
-
-                # Set up 4-bit quantization configuration
-                quantization_config = BitsAndBytesConfig(
-                    load_in_4bit=True,
-                    bnb_4bit_compute_dtype=torch.bfloat16,
-                    bnb_4bit_use_double_quant=True,
-                    bnb_4bit_quant_type="nf4"
-                )
+                # Prepare prompt for QwQ-32B
+                prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)
 
-                # Try using pipeline API for more efficient loading in Spaces
-                try:
-                    print("Attempting to load model with pipeline API...")
-                    self.text_generation_pipeline = pipeline(
-                        "text-generation",
-                        model=self.model_name,
-                        torch_dtype=torch.bfloat16,
-                        device_map="auto",
-                        trust_remote_code=True,
-                        quantization_config=quantization_config,
-                        model_kwargs={"attn_implementation": "eager"}  # Uses less memory
-                    )
-                    print(f"Successfully loaded {self.model_name} with pipeline API")
-                    # Pipeline includes both model and tokenizer
-                    self.tokenizer = self.text_generation_pipeline.tokenizer
-                    self.model = self.text_generation_pipeline.model
-                    self.initialized = True
-                    return
-                except Exception as pipe_e:
-                    print(f"Error loading with pipeline API: {str(pipe_e)}")
-                    print("Falling back to direct model loading...")
+                # Create messages for chat format
+                messages = [
+                    {"role": "user", "content": prompt}
+                ]
 
-                # Load tokenizer
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    self.model_name,
-                    trust_remote_code=True
+                # Apply chat template
+                text = self.tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
                 )
 
-                # Try to load model with 4-bit quantization
-                try:
-                    self.model = AutoModelForCausalLM.from_pretrained(
-                        self.model_name,
-                        device_map="auto",
-                        trust_remote_code=True,
-                        quantization_config=quantization_config
-                    )
-                    print(f"Successfully loaded {self.model_name} with 4-bit quantization")
-                except Exception as quant_e:
-                    print(f"Error loading with 4-bit quantization: {str(quant_e)}")
-                    print("Trying to load model with 8-bit quantization...")
-
-                    # Fall back to 8-bit or CPU if 4-bit fails
-                    if torch.cuda.is_available():
-                        self.model = AutoModelForCausalLM.from_pretrained(
-                            self.model_name,
-                            device_map="auto",
-                            trust_remote_code=True,
-                            load_in_8bit=True
-                        )
-                        print(f"Successfully loaded {self.model_name} with 8-bit quantization")
-                    else:
-                        # Fall back to template-based solution if no GPU
-                        self.model = None
-                        print(f"Warning: Loading {self.model_name} on CPU is not recommended. Using template-based explanations instead.")
+                # Tokenize
+                inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
 
-                self.initialized = True
-            except Exception as e:
-                print(f"Error loading explanation model: {str(e)}")
-                print("Falling back to template-based explanations.")
-                self.model = None
-                self.initialized = True
-
-    def generate_explanation(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
-        """Generate explanation for why a resume was ranked highly"""
-        # Check if we need to load the model
-        if not self.initialized:
-            self.load_model()
-
-        # If the model is loaded and available, use it for generating explanations
-        if self.model is not None:
-            try:
-                # Prepare prompt for QwQ-32B
-                prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)
+                # Generate response
+                output_ids = self.model.generate(
+                    **inputs,
+                    max_new_tokens=300,
+                    temperature=0.6,
+                    top_p=0.95,
+                    top_k=30
+                )
 
-                # Use pipeline API if available
-                if self.text_generation_pipeline is not None:
-                    outputs = self.text_generation_pipeline(
-                        prompt,
-                        max_new_tokens=300,
-                        temperature=0.6,
-                        top_p=0.95,
-                        top_k=30,
-                        do_sample=True,
-                        return_full_text=False
-                    )
-                    response = outputs[0]['generated_text']
-
-                else:
-                    # Create messages for chat format
-                    messages = [
-                        {"role": "user", "content": prompt}
-                    ]
-
-                    # Apply chat template
-                    text = self.tokenizer.apply_chat_template(
-                        messages,
-                        tokenize=False,
-                        add_generation_prompt=True
-                    )
-
-                    # Tokenize
-                    inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
-
-                    # Generate response
-                    output_ids = self.model.generate(
-                        **inputs,
-                        max_new_tokens=300,
-                        temperature=0.6,
-                        top_p=0.95,
-                        top_k=30
-                    )
-
-                    # Decode the response
-                    response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+                # Decode the response
+                response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
                 # Clean up the response
                 cleaned_response = self._clean_response(response)
@@ -158,7 +100,7 @@ class ExplanationGenerator:
             return cleaned_response
 
         except Exception as e:
-            print(f"Error generating explanation with model: {str(e)}")
+            print(f"Error generating explanation with QwQ-32B: {str(e)}")
             # Fall back to template-based explanation
             return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
         else:
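A minimal sketch of the resource gate that decides between the QwQ-backed and template-based paths, assuming the same 16 GB threshold the diff above uses for the 4-bit quantized model (the helper name is a hypothetical stand-in, not part of the commit):

# Illustrative sketch: only use the large model when a CUDA GPU with enough
# memory is present; otherwise callers fall back to template-based explanations.
# `can_load_quantized_llm` is a hypothetical name.
import torch

def can_load_quantized_llm(min_gpu_gb: int = 16) -> bool:
    """Return True when a CUDA GPU with at least `min_gpu_gb` of memory exists."""
    if not torch.cuda.is_available():
        return False
    total = torch.cuda.get_device_properties(0).total_memory
    return total >= min_gpu_gb * (1024 ** 3)

print("use LLM explanations:", can_load_quantized_llm())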
requirements.txt CHANGED
@@ -17,4 +17,4 @@ tqdm==4.66.1
 huggingface-hub==0.25.0
 einops
 bitsandbytes>=0.41.0
-accelerate>=0.21.0
+accelerate>=0.23.0