Muhammad541 committed on
Commit 0d77b69 · verified · 1 Parent(s): 905ff75

Update app.py

Files changed (1): app.py +72 -38

app.py CHANGED
@@ -8,7 +8,6 @@ import pickle
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import scipy.special
 from tqdm import tqdm
-from tabulate import tabulate
 from sklearn.feature_extraction.text import TfidfVectorizer
 from multiprocessing import Pool, cpu_count
 from flask import Flask, request, jsonify
@@ -22,10 +21,9 @@ logger = logging.getLogger(__name__)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 # Paths for saving artifacts
-MODEL_DIR = "./saved_models" # Primary location in /app/saved_models
-FALLBACK_MODEL_DIR = "/tmp/saved_models" # Fallback if ./saved_models fails
+MODEL_DIR = "./saved_models"
+FALLBACK_MODEL_DIR = "/tmp/saved_models"
 
-# Try to use the primary directory, fall back to /tmp if needed
 try:
     os.makedirs(MODEL_DIR, exist_ok=True)
     logger.info(f"Using model directory: {MODEL_DIR}")
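The try/except fallback matters on hosts like Hugging Face Spaces, where the app directory can be read-only while /tmp stays writable. A minimal sketch of the same pattern as a reusable helper (the helper name and the write-probe are illustrative, not part of this commit):

```python
import logging
import os

def pick_writable_dir(primary: str, fallback: str) -> str:
    """Return primary if it can be created and written to, else fallback."""
    try:
        os.makedirs(primary, exist_ok=True)
        probe = os.path.join(primary, ".write_test")
        with open(probe, "w") as f:  # os.makedirs alone doesn't prove writability
            f.write("ok")
        os.remove(probe)
        return primary
    except OSError:
        logging.warning("Primary dir unusable, falling back to %s", fallback)
        os.makedirs(fallback, exist_ok=True)
        return fallback
```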
@@ -35,13 +33,17 @@ except Exception as e:
     os.makedirs(FALLBACK_MODEL_DIR, exist_ok=True)
     chosen_model_dir = FALLBACK_MODEL_DIR
 
-# Update paths based on the chosen directory
+# Update paths
 UNIVERSAL_MODEL_PATH = os.path.join(chosen_model_dir, "universal_model")
 DETECTOR_MODEL_PATH = os.path.join(chosen_model_dir, "detector_model")
 TFIDF_PATH = os.path.join(chosen_model_dir, "tfidf_vectorizer.pkl")
 SKILL_TFIDF_PATH = os.path.join(chosen_model_dir, "skill_tfidf.pkl")
 QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
 FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
+QUESTION_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "question_embeddings.pkl")
+COURSE_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "course_embeddings.pkl")
+JOB_SKILL_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "job_skill_embeddings.pkl")
+JOB_TITLE_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "job_title_embeddings.pkl")
 
 # Improved dataset loading with fallback
 def load_dataset(file_path, required_columns=[], fallback_data=None):
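The four new `*_EMBEDDINGS_PATH` artifacts all follow the same load-or-compute pattern that `initialize_resources` implements below. A generic sketch of that pattern (the `load_or_compute` helper is illustrative, not in the commit):

```python
import os
import pickle

def load_or_compute(path, compute_fn):
    """Unpickle a cached artifact if present; otherwise compute and cache it."""
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    value = compute_fn()
    with open(path, "wb") as f:
        pickle.dump(value, f)
    return value

# Hypothetical usage mirroring the commit's caching:
# course_embeddings = load_or_compute(
#     COURSE_EMBEDDINGS_PATH,
#     lambda: universal_model.encode(texts, convert_to_tensor=True).cpu().numpy())
```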
@@ -82,7 +84,7 @@ jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company
     'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
     'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
     'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
-    'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate'] # Added level for job recommendations
+    'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
 })
 
 # Validate questions_df
@@ -107,17 +109,23 @@ else:
 detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
 detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
 
-# Precompute Resources with Validation
-def resources_valid(saved_skills, current_skills):
-    return set(saved_skills) == set(current_skills)
+# Global variables for cached embeddings
+question_embeddings = None
+course_embeddings = None
+job_skill_embeddings = None
+job_title_embeddings = None
 
+# Precompute Resources with Caching
 def initialize_resources(user_skills):
-    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings
+    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, question_embeddings, course_embeddings, job_skill_embeddings, job_title_embeddings
 
     user_skills_lower = [s.lower() for s in user_skills]
     needs_recompute = False
 
-    if all(os.path.exists(p) for p in [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH]):
+    # Check if cached resources exist
+    required_paths = [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH,
+                      QUESTION_EMBEDDINGS_PATH, COURSE_EMBEDDINGS_PATH, JOB_SKILL_EMBEDDINGS_PATH, JOB_TITLE_EMBEDDINGS_PATH]
+    if all(os.path.exists(p) for p in required_paths):
         try:
             with open(TFIDF_PATH, 'rb') as f:
                 tfidf_vectorizer = pickle.load(f)
@@ -126,6 +134,14 @@ def initialize_resources(user_skills):
             with open(QUESTION_ANSWER_PATH, 'rb') as f:
                 question_to_answer = pickle.load(f)
             faiss_index = faiss.read_index(FAISS_INDEX_PATH)
+            with open(QUESTION_EMBEDDINGS_PATH, 'rb') as f:
+                question_embeddings = pickle.load(f)
+            with open(COURSE_EMBEDDINGS_PATH, 'rb') as f:
+                course_embeddings = pickle.load(f)
+            with open(JOB_SKILL_EMBEDDINGS_PATH, 'rb') as f:
+                job_skill_embeddings = pickle.load(f)
+            with open(JOB_TITLE_EMBEDDINGS_PATH, 'rb') as f:
+                job_title_embeddings = pickle.load(f)
 
             if set(skill_tfidf.keys()) != set(user_skills_lower):
                 logger.info("Skill mismatch detected, recomputing resources")
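The set comparison above is the cache-invalidation trigger: if the keys of the unpickled `skill_tfidf` no longer match the lowercased requested skills, every artifact is rebuilt. A quick worked example:

```python
cached_skills = {"python", "sql"}             # keys of the unpickled skill_tfidf
requested = ["Python", "Flask"]
stale = cached_skills != {s.lower() for s in requested}
print(stale)  # True -> needs_recompute is set and the artifacts are rebuilt
```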
@@ -144,23 +160,34 @@ def initialize_resources(user_skills):
 
     skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in user_skills}
     question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
-    answer_embeddings = universal_model.encode(list(question_to_answer.values()), convert_to_tensor=True).cpu().numpy()
+    answers = list(question_to_answer.values())
+    question_embeddings = universal_model.encode(answers, batch_size=128, convert_to_tensor=True).cpu().numpy()
 
-    faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
-    faiss_index.add(answer_embeddings)
+    faiss_index = faiss.IndexFlatL2(question_embeddings.shape[1])
+    faiss_index.add(question_embeddings)
+
+    # Precompute embeddings for courses and jobs
+    course_embeddings = universal_model.encode(courses_df['skills'].fillna("").tolist(), batch_size=128, convert_to_tensor=True).cpu().numpy()
+    job_field = 'required_skills' if 'required_skills' in jobs_df.columns else 'job_description'
+    job_skill_embeddings = universal_model.encode(jobs_df[job_field].fillna("").tolist(), batch_size=128, convert_to_tensor=True).cpu().numpy()
+    job_title_embeddings = universal_model.encode(jobs_df['job_title'].fillna("").tolist(), batch_size=128, convert_to_tensor=True).cpu().numpy()
 
     # Save resources
     with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
     with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
     with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
     faiss.write_index(faiss_index, FAISS_INDEX_PATH)
+    with open(QUESTION_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(question_embeddings, f)
+    with open(COURSE_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(course_embeddings, f)
+    with open(JOB_SKILL_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(job_skill_embeddings, f)
+    with open(JOB_TITLE_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(job_title_embeddings, f)
     universal_model.save(UNIVERSAL_MODEL_PATH)
     logger.info(f"Resources saved to {chosen_model_dir}")
 
-# Enhanced evaluation with batch processing
+# Enhanced evaluation with batch processing and parallelization
 def evaluate_response(args):
     try:
-        skill, user_answer, question = args
+        skill, user_answer, expected_answer, user_answer_embedding, expected_answer_embedding = args
         if not user_answer:
             return skill, 0.0, False
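For reference, the FAISS build in this hunk is an exact (brute-force) L2 index over the answer embeddings. A self-contained sketch of the same calls, with made-up dimensions and data:

```python
import faiss
import numpy as np

emb = np.random.rand(100, 384).astype("float32")  # FAISS expects float32 rows
index = faiss.IndexFlatL2(emb.shape[1])           # exact L2 search, no training step
index.add(emb)
dists, ids = index.search(emb[:1], 5)             # 5 nearest stored vectors to the query
```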
@@ -170,9 +197,7 @@ def evaluate_response(args):
         probs = scipy.special.softmax(logits, axis=1).tolist()[0]
         is_ai = probs[1] > 0.5
 
-        expected_answer = question_to_answer.get(question, "")
-        user_embeddings = universal_model.encode([user_answer, expected_answer], batch_size=32, convert_to_tensor=True)
-        score = util.pytorch_cos_sim(user_embeddings[0], user_embeddings[1]).item() * 100
+        score = util.pytorch_cos_sim(user_answer_embedding, expected_answer_embedding).item() * 100
 
         user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
         skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
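The per-call `encode` is gone: workers now receive precomputed embeddings and only run the cosine similarity. A standalone sketch of that scoring step (the model name is an assumed stand-in; the app's `universal_model` is defined elsewhere in the file):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed stand-in for universal_model
u, e = model.encode(["user answer text", "expected answer text"], convert_to_tensor=True)
score = util.pytorch_cos_sim(u, e).item() * 100  # similarity scaled to 0-100
print(round(score, 1))
```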
@@ -184,7 +209,7 @@
         logger.error(f"Evaluation error for {skill}: {e}")
         return skill, 0.0, False
 
-# Improved course recommendation with batch processing
+# Improved course recommendation with cached embeddings
 def recommend_courses(skills_to_improve, user_level, upgrade=False):
     try:
         if not skills_to_improve or courses_df.empty:
@@ -195,10 +220,8 @@ def recommend_courses(skills_to_improve, user_level, upgrade=False):
         if 'completion_rate' not in courses_df:
             courses_df['completion_rate'] = 0.7
 
-        # Batch encode skills and courses
-        skill_embeddings = universal_model.encode(skills_to_improve, batch_size=32, convert_to_tensor=True)
-        course_embeddings = universal_model.encode(courses_df['skills'].fillna(""), batch_size=32, convert_to_tensor=True)
-        similarities = util.pytorch_cos_sim(skill_embeddings, course_embeddings).numpy()
+        skill_embeddings = universal_model.encode(skills_to_improve, batch_size=128, convert_to_tensor=True)
+        similarities = util.pytorch_cos_sim(skill_embeddings, torch.tensor(course_embeddings)).numpy()
 
         total_scores = 0.6 * similarities + 0.2 * courses_df['popularity'].values + 0.2 * courses_df['completion_rate'].values
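Because the cached course embeddings are stored as numpy arrays, they are wrapped back into a tensor before the similarity call (this assumes `torch` is imported at the top of the file, which this diff doesn't show). `torch.from_numpy` would do the same without a copy:

```python
import numpy as np
import torch

cached = np.zeros((10, 384), dtype=np.float32)  # stand-in for unpickled course_embeddings
t_copy = torch.tensor(cached)      # copies the data, as in the hunk above
t_view = torch.from_numpy(cached)  # zero-copy alternative sharing the same memory
```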
@@ -215,29 +238,24 @@ def recommend_courses(skills_to_improve, user_level, upgrade=False):
         logger.error(f"Course recommendation error: {e}")
         return []
 
-# Enhanced job recommendation with fixed level handling
+# Enhanced job recommendation with cached embeddings
 def recommend_jobs(user_skills, user_level):
     try:
         if jobs_df.empty:
             return []
 
-        job_field = 'required_skills' if 'required_skills' in jobs_df.columns else 'job_description'
-        job_embeddings = universal_model.encode(jobs_df[job_field].fillna(""), batch_size=32, convert_to_tensor=True)
-        user_embedding = universal_model.encode(" ".join(user_skills), batch_size=32, convert_to_tensor=True)
-        skill_similarities = util.pytorch_cos_sim(user_embedding, job_embeddings).numpy()[0]
+        user_embedding = universal_model.encode(" ".join(user_skills), batch_size=128, convert_to_tensor=True)
+        skill_similarities = util.pytorch_cos_sim(user_embedding, torch.tensor(job_skill_embeddings)).numpy()[0]
 
-        # Ensure level column exists and is a Series
         if 'level' not in jobs_df.columns:
             jobs_df['level'] = 'Intermediate'
-        level_col = jobs_df['level'].astype(str) # Ensure it's a string Series
-
+        level_col = jobs_df['level'].astype(str)
         level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
         user_level_num = level_map.get(user_level, 1)
         level_scores = level_col.apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2)
 
         location_pref = jobs_df.get('location', pd.Series(['Remote'] * len(jobs_df))).apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7)
-        industry_embeddings = universal_model.encode(jobs_df['job_title'].fillna(""), batch_size=32, convert_to_tensor=True)
-        industry_similarities = util.pytorch_cos_sim(user_embedding, industry_embeddings).numpy()[0]
+        industry_similarities = util.pytorch_cos_sim(user_embedding, torch.tensor(job_title_embeddings)).numpy()[0]
 
         total_job_scores = 0.5 * skill_similarities + 0.2 * level_scores + 0.1 * location_pref + 0.2 * industry_similarities
         top_job_indices = np.argsort(-total_job_scores)[:5]
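The level term maps levels to 0/1/2 and scores by distance, so a one-level mismatch costs half the level weight. A quick check of the formula:

```python
level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
user_level_num = level_map['Intermediate']  # 1
for job_level in level_map:
    print(job_level, 1 - abs(level_map[job_level] - user_level_num) / 2)
# Beginner 0.5, Intermediate 1.0, Advanced 0.5
```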
@@ -292,22 +310,41 @@ def assess_skills():
         if len(user_questions) != len(user_skills):
             return jsonify({"error": f"Internal error: Number of selected questions ({len(user_questions)}) does not match number of skills ({len(user_skills)})."}), 500
 
+        # Batch encode all user answers and expected answers
+        user_answers = []
+        expected_answers = []
+        skipped_questions = []
         user_responses = []
         for idx, row in user_questions.iterrows():
            answer = answers[idx]
            if not answer or answer.lower() == 'skip':
-                user_responses.append((row['Skill'], None, row['Question']))
+                user_responses.append((row['Skill'], None, None, None, None))
+                skipped_questions.append(f"{row['Skill']} ({row['Question']})")  # record here; the tuple no longer carries the question text
             else:
-                user_responses.append((row['Skill'], answer, row['Question']))
+                user_answers.append(answer)
+                expected_answer = question_to_answer.get(row['Question'], "")
+                expected_answers.append(expected_answer)
+                user_responses.append((row['Skill'], answer, expected_answer, None, None))
+
+        if user_answers:
+            all_embeddings = universal_model.encode(user_answers + expected_answers, batch_size=128, convert_to_tensor=True)
+            user_answer_embeddings = all_embeddings[:len(user_answers)]
+            expected_answer_embeddings = all_embeddings[len(user_answers):]
+
+            answered = 0  # embeddings are indexed by answered position, not by position in user_responses
+            for idx, (skill, answer, expected, _, _) in enumerate(user_responses):
+                if answer:
+                    user_responses[idx] = (skill, answer, expected, user_answer_embeddings[answered], expected_answer_embeddings[answered])
+                    answered += 1
 
+        # Parallelize evaluation
         with Pool(processes=min(cpu_count(), 4)) as pool:
-            eval_args = [(skill, user_code, question) for skill, user_code, question in user_responses if user_code]
+            eval_args = [response for response in user_responses if response[1]]
             results = pool.map(evaluate_response, eval_args)
 
         user_scores = {}
         ai_flags = {}
         scores_list = []
-        skipped_questions = [f"{skill} ({question})" for skill, user_code, question in user_responses if user_code is None]
         for skill, score, is_ai in results:
             if skill in user_scores:
                 user_scores[skill] = max(user_scores[skill], score)
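The single `encode` call over `user_answers + expected_answers` amortizes one model pass across every pair, and the result is split back by length. A minimal sketch of that split (the model name is again an assumed stand-in for `universal_model`):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed stand-in for universal_model
user_texts = ["answer one", "answer two"]
expected_texts = ["reference one", "reference two"]

all_emb = model.encode(user_texts + expected_texts, convert_to_tensor=True)
user_emb = all_emb[:len(user_texts)]      # first half lines up with user_texts
expected_emb = all_emb[len(user_texts):]  # second half lines up with expected_texts
```

One consequence worth knowing: `Pool.map` pickles each argument tuple for its worker process, so the embeddings travel as serialized tensors; keeping them on CPU (as the `.cpu().numpy()` caching elsewhere in this commit does) avoids GPU-in-subprocess issues.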