"""
Explanation Generator Module

This module handles the generation of explanations for resume rankings
using the QwQ-32B model from Hugging Face.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import re

class ExplanationGenerator:
    def __init__(self, model_name="Qwen/QwQ-32B", load_immediately=True):
        """Initialize the explanation generator with the specified model"""
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.text_generation_pipeline = None
        self.initialized = False
        
        # Load model immediately if requested
        if load_immediately:
            self.load_model()
        
    def load_model(self):
        """Load the model and tokenizer if not already loaded"""
        if not self.initialized:
            try:
                print(f"Loading explanation model: {self.model_name}")
                
                # Set up 4-bit quantization configuration
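                # NF4 4-bit weights with double quantization cut memory use to roughly a
                # quarter of fp16, which is what makes loading a 32B model practical here;
                # bfloat16 is used for compute on the de-quantized weights.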
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4"
                )
                
                # Try using pipeline API for more efficient loading in Spaces
                try:
                    print("Attempting to load model with pipeline API...")
                    self.text_generation_pipeline = pipeline(
                        "text-generation",
                        model=self.model_name,
                        torch_dtype=torch.bfloat16,
                        device_map="auto",
                        trust_remote_code=True,
                        # Model-loading options must go through model_kwargs so they reach
                        # AutoModelForCausalLM.from_pretrained()
                        model_kwargs={
                            "quantization_config": quantization_config,
                            "attn_implementation": "eager"  # uses less memory
                        }
                    )
                    print(f"Successfully loaded {self.model_name} with pipeline API")
                    # Pipeline includes both model and tokenizer
                    self.tokenizer = self.text_generation_pipeline.tokenizer
                    self.model = self.text_generation_pipeline.model
                    self.initialized = True
                    return
                except Exception as pipe_e:
                    print(f"Error loading with pipeline API: {str(pipe_e)}")
                    print("Falling back to direct model loading...")
                
                # Load tokenizer
                self.tokenizer = AutoTokenizer.from_pretrained(
                    self.model_name, 
                    trust_remote_code=True
                )
                
                # Try to load model with 4-bit quantization
                try:
                    self.model = AutoModelForCausalLM.from_pretrained(
                        self.model_name,
                        device_map="auto",
                        trust_remote_code=True,
                        quantization_config=quantization_config
                    )
                    print(f"Successfully loaded {self.model_name} with 4-bit quantization")
                except Exception as quant_e:
                    print(f"Error loading with 4-bit quantization: {str(quant_e)}")
                    print("Trying to load model with 8-bit quantization...")
                    
                    # Fall back to 8-bit or CPU if 4-bit fails
                    if torch.cuda.is_available():
                        self.model = AutoModelForCausalLM.from_pretrained(
                            self.model_name,
                            device_map="auto",
                            trust_remote_code=True,
                            quantization_config=BitsAndBytesConfig(load_in_8bit=True)
                        )
                        print(f"Successfully loaded {self.model_name} with 8-bit quantization")
                    else:
                        # Fall back to template-based solution if no GPU
                        self.model = None
                        print(f"Warning: Loading {self.model_name} on CPU is not recommended. Using template-based explanations instead.")
                
                self.initialized = True
            except Exception as e:
                print(f"Error loading explanation model: {str(e)}")
                print("Falling back to template-based explanations.")
                self.model = None
                self.initialized = True
    
    def generate_explanation(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
        """Generate explanation for why a resume was ranked highly"""
        # Check if we need to load the model
        if not self.initialized:
            self.load_model()
        
        # If the model is loaded and available, use it for generating explanations
        if self.model is not None:
            try:
                # Prepare prompt for QwQ-32B
                prompt = self._create_prompt(resume_text, job_description, score, semantic_score, keyword_score, skills)
                
                # Use pipeline API if available
                if self.text_generation_pipeline is not None:
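                    # Sampling settings below (temperature 0.6, top_p 0.95, moderate top_k)
                    # are in the range the QwQ-32B model card suggests for reasoning output;
                    # adjust them if a different model is configured.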
                    outputs = self.text_generation_pipeline(
                        prompt,
                        max_new_tokens=300,
                        temperature=0.6,
                        top_p=0.95,
                        top_k=30,
                        do_sample=True,
                        return_full_text=False
                    )
                    response = outputs[0]['generated_text']
                    
                else:
                    # Create messages for chat format
                    messages = [
                        {"role": "user", "content": prompt}
                    ]
                    
                    # Apply chat template
                    text = self.tokenizer.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )
                    
                    # Tokenize
                    inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
                    
                    # Generate response
                    output_ids = self.model.generate(
                        **inputs,
                        max_new_tokens=300,
                        do_sample=True,
                        temperature=0.6,
                        top_p=0.95,
                        top_k=30
                    )
                    
                    # Decode the response
                    response = self.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
                
                # Clean up the response
                cleaned_response = self._clean_response(response)
                
                return cleaned_response
                
            except Exception as e:
                print(f"Error generating explanation with model: {str(e)}")
                # Fall back to template-based explanation
                return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
        else:
            # Use template-based explanation if model is not available
            return self._generate_template_explanation(score, semantic_score, keyword_score, skills)
    
    def _create_prompt(self, resume_text, job_description, score, semantic_score, keyword_score, skills):
        """Create a prompt for the explanation generation"""
        # Use only the first 1000 characters of the resume to keep prompt size manageable
        resume_excerpt = (resume_text[:1000] + "...") if len(resume_text) > 1000 else resume_text
        
        prompt = f"""You are an AI assistant helping a recruiter understand why a candidate's resume was matched with a job posting.

The resume has been assigned the following scores:
- Overall Match Score: {score:.2f} out of 1.0
- Semantic Relevance Score: {semantic_score:.2f} out of 1.0
- Keyword Match Score: {keyword_score:.2f} out of 1.0

The job description is:
```
{job_description}
```

Based on analysis, the resume contains these skills relevant to the job: {', '.join(skills)}

Resume excerpt:
```
{resume_excerpt}
```

Please provide a short explanation (3-5 sentences) of why this resume received these scores and how well it matches the job requirements. Focus on the relationship between the candidate's experience and the job requirements."""

        return prompt
    
    def _clean_response(self, response):
        """Clean the response from the model"""
        # Remove any thinking/internal reasoning tokens (QwQ may emit <think>...</think> blocks)
        response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
        
        # Limit to a reasonable length
        if len(response) > 500:
            sentences = response.split('.')
            shortened = '.'.join(sentences[:5]) + '.'
            return shortened
        
        return response
    
    def _generate_template_explanation(self, score, semantic_score, keyword_score, skills):
        """Generate a template-based explanation when the model is not available"""
        # Simple template-based explanation
        if score > 0.8:
            quality = "excellent"
        elif score > 0.6:
            quality = "good"
        elif score > 0.4:
            quality = "moderate"
        else:
            quality = "limited"
            
        explanation = f"This resume shows {quality} alignment with the job requirements, with an overall score of {score:.2f}. "
        
        if semantic_score > keyword_score:
            explanation += f"The candidate's experience demonstrates strong semantic relevance ({semantic_score:.2f}) to the position, though specific keyword matches ({keyword_score:.2f}) could be improved. "
        else:
            explanation += f"The resume contains many relevant keywords ({keyword_score:.2f}), but could benefit from better contextual alignment ({semantic_score:.2f}) with the job requirements. "
        
        if skills:
            if len(skills) > 3:
                explanation += f"Key skills identified include {', '.join(skills[:3])}, and {len(skills)-3} others that match the job requirements."
            else:
                explanation += f"Key skills identified include {', '.join(skills)}."
        else:
            explanation += "No specific skills were identified that directly match the requirements."
            
        return explanation
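

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module):
# it shows how the class above might be wired into a ranking pipeline.
# All literal values below are placeholders; the real resume text, job
# description, scores, and skill list come from the rest of the application.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Defer model loading; generate_explanation() will load QwQ-32B on demand
    # and fall back to template-based explanations if loading fails.
    generator = ExplanationGenerator(load_immediately=False)

    explanation = generator.generate_explanation(
        resume_text="Senior Python developer with six years of NLP experience...",   # placeholder
        job_description="Machine learning engineer role requiring Python and NLP.",  # placeholder
        score=0.78,             # overall match score (placeholder)
        semantic_score=0.82,    # semantic relevance score (placeholder)
        keyword_score=0.71,     # keyword match score (placeholder)
        skills=["Python", "NLP", "PyTorch"],  # skills extracted elsewhere (placeholder)
    )
    print(explanation)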