Muhammad541 committed
Commit 729d876 · verified · 1 Parent(s): 0d77b69

Update app.py

Files changed (1)
  1. app.py +126 -189
app.py CHANGED
@@ -7,9 +7,7 @@ import numpy as np
 import pickle
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import scipy.special
-from tqdm import tqdm
 from sklearn.feature_extraction.text import TfidfVectorizer
-from multiprocessing import Pool, cpu_count
 from flask import Flask, request, jsonify
 import logging
 
@@ -40,67 +38,48 @@ TFIDF_PATH = os.path.join(chosen_model_dir, "tfidf_vectorizer.pkl")
 SKILL_TFIDF_PATH = os.path.join(chosen_model_dir, "skill_tfidf.pkl")
 QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
 FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
-QUESTION_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "question_embeddings.pkl")
-COURSE_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "course_embeddings.pkl")
-JOB_SKILL_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "job_skill_embeddings.pkl")
-JOB_TITLE_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "job_title_embeddings.pkl")
-
-# Improved dataset loading with fallback
-def load_dataset(file_path, required_columns=[], fallback_data=None):
-    try:
-        df = pd.read_csv(file_path)
-        for col in required_columns:
-            if col not in df.columns:
-                logger.warning(f"Column '{col}' missing in {file_path}. Using default values.")
-                df[col] = ""
-        return df
-    except Exception as e:
-        logger.error(f"Error loading {file_path}: {e}")
-        if fallback_data is not None:
-            logger.info(f"Using fallback data for {file_path}")
-            return pd.DataFrame(fallback_data)
-        return None
-
-# Load datasets with fallbacks
-questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"], {
-    'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
-    'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
-                 'Intermediate Python question', 'Basic Kubernetes question'],
-    'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
-})
-
-courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"], {
-    'skills': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
-    'course_title': ['Linux Admin', 'Git Mastery', 'Node.js Advanced', 'Python for Data', 'Kubernetes Basics'],
-    'Organization': ['Coursera', 'Udemy', 'Pluralsight', 'edX', 'Linux Foundation'],
-    'level': ['Intermediate', 'Intermediate', 'Advanced', 'Advanced', 'Intermediate'],
-    'popularity': [0.85, 0.9, 0.8, 0.95, 0.9],
-    'completion_rate': [0.65, 0.7, 0.6, 0.8, 0.75]
-})
-
-jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"], {
-    'job_title': ['DevOps Engineer', 'Cloud Architect', 'Software Engineer', 'Data Scientist', 'Security Analyst'],
-    'company_name': ['Tech Corp', 'Cloud Inc', 'Tech Solutions', 'Data Co', 'SecuriTech'],
-    'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
-    'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
-    'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
-    'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
-})
-
-# Validate questions_df
-if questions_df is None or questions_df.empty:
-    logger.error("questions_df is empty or could not be loaded. Exiting.")
-    exit(1)
-if not all(col in questions_df.columns for col in ["Skill", "Question", "Answer"]):
-    logger.error("questions_df is missing required columns. Exiting.")
-    exit(1)
 logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {list(questions_df['Skill'].unique())}")

-# Load or Initialize Models
 if os.path.exists(UNIVERSAL_MODEL_PATH):
     universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH)
 else:
-    universal_model = SentenceTransformer("all-MiniLM-L6-v2")

 if os.path.exists(DETECTOR_MODEL_PATH):
     detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
@@ -109,85 +88,72 @@ else:
     detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
     detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")

-# Global variables for cached embeddings
-question_embeddings = None
-course_embeddings = None
-job_skill_embeddings = None
-job_title_embeddings = None
-
-# Precompute Resources with Caching
-def initialize_resources(user_skills):
-    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, question_embeddings, course_embeddings, job_skill_embeddings, job_title_embeddings
-
-    user_skills_lower = [s.lower() for s in user_skills]
-    needs_recompute = False
-
-    # Check if cached resources exist
-    required_paths = [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH,
-                      QUESTION_EMBEDDINGS_PATH, COURSE_EMBEDDINGS_PATH, JOB_SKILL_EMBEDDINGS_PATH, JOB_TITLE_EMBEDDINGS_PATH]
-    if all(os.path.exists(p) for p in required_paths):
         try:
-            with open(TFIDF_PATH, 'rb') as f:
-                tfidf_vectorizer = pickle.load(f)
-            with open(SKILL_TFIDF_PATH, 'rb') as f:
-                skill_tfidf = pickle.load(f)
-            with open(QUESTION_ANSWER_PATH, 'rb') as f:
-                question_to_answer = pickle.load(f)
             faiss_index = faiss.read_index(FAISS_INDEX_PATH)
-            with open(QUESTION_EMBEDDINGS_PATH, 'rb') as f:
-                question_embeddings = pickle.load(f)
-            with open(COURSE_EMBEDDINGS_PATH, 'rb') as f:
-                course_embeddings = pickle.load(f)
-            with open(JOB_SKILL_EMBEDDINGS_PATH, 'rb') as f:
-                job_skill_embeddings = pickle.load(f)
-            with open(JOB_TITLE_EMBEDDINGS_PATH, 'rb') as f:
-                job_title_embeddings = pickle.load(f)
-
-            if set(skill_tfidf.keys()) != set(user_skills_lower):
-                logger.info("Skill mismatch detected, recomputing resources")
-                needs_recompute = True
         except Exception as e:
-            logger.error(f"Error loading saved resources: {e}")
-            needs_recompute = True
     else:
-        needs_recompute = True
-
-    if needs_recompute:
-        logger.info("Building new resources")
-        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
-        all_texts = user_skills + questions_df['Answer'].fillna("").tolist() + questions_df['Question'].tolist()
-        tfidf_vectorizer.fit(all_texts)
-
-        skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in user_skills}
-        question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
-        answers = list(question_to_answer.values())
-        question_embeddings = universal_model.encode(answers, batch_size=128, convert_to_tensor=True).cpu().numpy()
-
-        faiss_index = faiss.IndexFlatL2(question_embeddings.shape[1])
-        faiss_index.add(question_embeddings)
-
-        # Precompute embeddings for courses and jobs
-        course_embeddings = universal_model.encode(courses_df['skills'].fillna("").tolist(), batch_size=128, convert_to_tensor=True).cpu().numpy()
-        job_field = 'required_skills' if 'required_skills' in jobs_df.columns else 'job_description'
-        job_skill_embeddings = universal_model.encode(jobs_df[job_field].fillna("").tolist(), batch_size=128, convert_to_tensor=True).cpu().numpy()
-        job_title_embeddings = universal_model.encode(jobs_df['job_title'].fillna("").tolist(), batch_size=128, convert_to_tensor=True).cpu().numpy()
-
-        # Save resources
-        with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
-        with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
-        with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
-        faiss.write_index(faiss_index, FAISS_INDEX_PATH)
-        with open(QUESTION_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(question_embeddings, f)
-        with open(COURSE_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(course_embeddings, f)
-        with open(JOB_SKILL_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(job_skill_embeddings, f)
-        with open(JOB_TITLE_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(job_title_embeddings, f)
-        universal_model.save(UNIVERSAL_MODEL_PATH)
-        logger.info(f"Resources saved to {chosen_model_dir}")
-
-# Enhanced evaluation with batch processing and parallelization
 def evaluate_response(args):
     try:
-        skill, user_answer, expected_answer, user_answer_embedding, expected_answer_embedding = args
         if not user_answer:
             return skill, 0.0, False
 
@@ -197,7 +163,9 @@ def evaluate_response(args):
         probs = scipy.special.softmax(logits, axis=1).tolist()[0]
         is_ai = probs[1] > 0.5

-        score = util.pytorch_cos_sim(user_answer_embedding, expected_answer_embedding).item() * 100

         user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
         skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
@@ -209,43 +177,40 @@ def evaluate_response(args):
         logger.error(f"Evaluation error for {skill}: {e}")
         return skill, 0.0, False

-# Improved course recommendation with cached embeddings
 def recommend_courses(skills_to_improve, user_level, upgrade=False):
     try:
         if not skills_to_improve or courses_df.empty:
             return []

-        if 'popularity' not in courses_df:
-            courses_df['popularity'] = 0.8
-        if 'completion_rate' not in courses_df:
-            courses_df['completion_rate'] = 0.7
-
-        skill_embeddings = universal_model.encode(skills_to_improve, batch_size=128, convert_to_tensor=True)
-        similarities = util.pytorch_cos_sim(skill_embeddings, torch.tensor(course_embeddings)).numpy()

-        total_scores = 0.6 * similarities + 0.2 * courses_df['popularity'].values + 0.2 * courses_df['completion_rate'].values

-        recommendations = []
         target_level = 'Advanced' if upgrade else user_level
-        for i, skill in enumerate(skills_to_improve):
-            idx = np.argsort(-total_scores[i])[:5]
-            candidates = courses_df.iloc[idx]
-            candidates = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
-            recommendations.extend(candidates[['course_title', 'Organization']].values.tolist()[:3])
-
-        return list(dict.fromkeys(map(tuple, recommendations)))
     except Exception as e:
         logger.error(f"Course recommendation error: {e}")
         return []

-# Enhanced job recommendation with cached embeddings
 def recommend_jobs(user_skills, user_level):
     try:
         if jobs_df.empty:
             return []

-        user_embedding = universal_model.encode(" ".join(user_skills), batch_size=128, convert_to_tensor=True)
-        skill_similarities = util.pytorch_cos_sim(user_embedding, torch.tensor(job_skill_embeddings)).numpy()[0]

         if 'level' not in jobs_df.columns:
             jobs_df['level'] = 'Intermediate'
@@ -255,9 +220,7 @@ def recommend_jobs(user_skills, user_level):
         level_scores = level_col.apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2)

         location_pref = jobs_df.get('location', pd.Series(['Remote'] * len(jobs_df))).apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7)
-        industry_similarities = util.pytorch_cos_sim(user_embedding, torch.tensor(job_title_embeddings)).numpy()[0]
-
-        total_job_scores = 0.5 * skill_similarities + 0.2 * level_scores + 0.1 * location_pref + 0.2 * industry_similarities
         top_job_indices = np.argsort(-total_job_scores)[:5]

         return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'],
@@ -287,16 +250,11 @@ def assess_skills():
     if len(answers) != len(user_skills):
         return jsonify({"error": "Answers count must match skills count"}), 400

-    initialize_resources(user_skills)
-
-    # Get relevant questions
-    filtered_questions = questions_df[questions_df['Skill'].str.lower().isin([skill.lower() for skill in user_skills])]
-    if filtered_questions.empty:
-        return jsonify({"error": "No matching questions found for the user's skills."}), 500

     user_questions = []
     for skill in user_skills:
-        skill_questions = filtered_questions[filtered_questions['Skill'].str.lower() == skill.lower()]
         if not skill_questions.empty:
             user_questions.append(skill_questions.sample(1).iloc[0])
         else:
@@ -307,41 +265,21 @@ def assess_skills():
             })
     user_questions = pd.DataFrame(user_questions).reset_index(drop=True)

-    if len(user_questions) != len(user_skills):
-        return jsonify({"error": f"Internal error: Number of selected questions ({len(user_questions)}) does not match number of skills ({len(user_skills)})."}), 500
-
-    # Batch encode all user answers and expected answers
-    user_answers = []
-    expected_answers = []
     user_responses = []
     for idx, row in user_questions.iterrows():
         answer = answers[idx]
         if not answer or answer.lower() == 'skip':
-            user_responses.append((row['Skill'], None, None, None, None))
         else:
-            user_answers.append(answer)
-            expected_answer = question_to_answer.get(row['Question'], "")
-            expected_answers.append(expected_answer)
-            user_responses.append((row['Skill'], answer, expected_answer, None, None))
-
-    if user_answers:
-        all_embeddings = universal_model.encode(user_answers + expected_answers, batch_size=128, convert_to_tensor=True)
-        user_answer_embeddings = all_embeddings[:len(user_answers)]
-        expected_answer_embeddings = all_embeddings[len(user_answers):]
-
-        for idx, (skill, answer, expected, _, _) in enumerate(user_responses):
-            if answer:
-                user_responses[idx] = (skill, answer, expected, user_answer_embeddings[idx], expected_answer_embeddings[idx])
-
-    # Parallelize evaluation
-    with Pool(processes=min(cpu_count(), 4)) as pool:
-        eval_args = [response for response in user_responses if response[1]]
-        results = pool.map(evaluate_response, eval_args)

     user_scores = {}
     ai_flags = {}
     scores_list = []
-    skipped_questions = [f"{skill} ({question})" for skill, user_code, question, _, _ in user_responses if not user_code]
     for skill, score, is_ai in results:
         if skill in user_scores:
             user_scores[skill] = max(user_scores[skill], score)
@@ -355,7 +293,6 @@ def assess_skills():
     dynamic_threshold = max(40, mean_score)
     weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]

-    # Generate recommendations
     courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
     jobs = recommend_jobs(user_skills, user_level)
 
 
 import pickle
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import scipy.special
 from sklearn.feature_extraction.text import TfidfVectorizer
 from flask import Flask, request, jsonify
 import logging
 
 
 SKILL_TFIDF_PATH = os.path.join(chosen_model_dir, "skill_tfidf.pkl")
 QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
 FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
+COURSE_SIMILARITY_PATH = os.path.join(chosen_model_dir, "course_similarity.pkl")
+JOB_SIMILARITY_PATH = os.path.join(chosen_model_dir, "job_similarity.pkl")
+
+# Load datasets with fallbacks (precomputed offline)
+questions_df = pd.read_csv("Generated_Skill-Based_Questions.csv", usecols=["Skill", "Question", "Answer"])
+if questions_df.empty:
+    questions_df = pd.DataFrame({
+        'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
+        'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
+                     'Intermediate Python question', 'Basic Kubernetes question'],
+        'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
+    })
 logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {list(questions_df['Skill'].unique())}")

+courses_df = pd.read_csv("coursera_course_dataset_v2_no_null.csv", usecols=["skills", "course_title", "Organization", "level"])
+if courses_df.empty:
+    courses_df = pd.DataFrame({
+        'skills': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
+        'course_title': ['Linux Admin', 'Git Mastery', 'Node.js Advanced', 'Python for Data', 'Kubernetes Basics'],
+        'Organization': ['Coursera', 'Udemy', 'Pluralsight', 'edX', 'Linux Foundation'],
+        'level': ['Intermediate', 'Intermediate', 'Advanced', 'Advanced', 'Intermediate'],
+        'popularity': [0.85, 0.9, 0.8, 0.95, 0.9],
+        'completion_rate': [0.65, 0.7, 0.6, 0.8, 0.75]
+    })
+
+jobs_df = pd.read_csv("Updated_Job_Posting_Dataset.csv", usecols=["job_title", "company_name", "location", "required_skills", "job_description"])
+if jobs_df.empty:
+    jobs_df = pd.DataFrame({
+        'job_title': ['DevOps Engineer', 'Cloud Architect', 'Software Engineer', 'Data Scientist', 'Security Analyst'],
+        'company_name': ['Tech Corp', 'Cloud Inc', 'Tech Solutions', 'Data Co', 'SecuriTech'],
+        'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
+        'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
+        'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
+        'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
+    })
+
+# Load or Initialize Models (lighter model)
 if os.path.exists(UNIVERSAL_MODEL_PATH):
     universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH)
 else:
+    universal_model = SentenceTransformer("all-MiniLM-L6-v2-distilled")  # Lighter model
+    universal_model.save(UNIVERSAL_MODEL_PATH)

 if os.path.exists(DETECTOR_MODEL_PATH):
     detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
 
     detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
     detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")

+# Global variables for precomputed data
+tfidf_vectorizer = None
+skill_tfidf = None
+question_to_answer = None
+faiss_index = None
+course_similarity = None
+job_similarity = None
+
+# Load Precomputed Resources
+def load_precomputed_resources():
+    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, course_similarity, job_similarity
+    if all(os.path.exists(p) for p in [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH, COURSE_SIMILARITY_PATH, JOB_SIMILARITY_PATH]):
         try:
+            with open(TFIDF_PATH, 'rb') as f: tfidf_vectorizer = pickle.load(f)
+            with open(SKILL_TFIDF_PATH, 'rb') as f: skill_tfidf = pickle.load(f)
+            with open(QUESTION_ANSWER_PATH, 'rb') as f: question_to_answer = pickle.load(f)
             faiss_index = faiss.read_index(FAISS_INDEX_PATH)
+            with open(COURSE_SIMILARITY_PATH, 'rb') as f: course_similarity = pickle.load(f)
+            with open(JOB_SIMILARITY_PATH, 'rb') as f: job_similarity = pickle.load(f)
+            logger.info("Loaded precomputed resources successfully")
         except Exception as e:
+            logger.error(f"Error loading precomputed resources: {e}")
+            precompute_resources()
     else:
+        precompute_resources()
+
+# Precompute Resources Offline (to be run separately)
+def precompute_resources():
+    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, course_similarity, job_similarity
+    logger.info("Precomputing resources offline")
+    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
+    all_texts = questions_df['Answer'].tolist() + questions_df['Question'].tolist()
+    tfidf_vectorizer.fit(all_texts)
+
+    skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in questions_df['Skill'].unique()}
+    question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
+    answer_embeddings = universal_model.encode(questions_df['Answer'].tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
+
+    faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
+    faiss_index.add(answer_embeddings)
+
+    # Precompute course similarities
+    course_skills = courses_df['skills'].fillna("").tolist()
+    course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
+    skill_embeddings = universal_model.encode(questions_df['Skill'].unique().tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
+    course_similarity = util.pytorch_cos_sim(torch.tensor(skill_embeddings), torch.tensor(course_embeddings)).numpy()
+
+    # Precompute job similarities
+    job_skills = jobs_df['required_skills'].fillna("").tolist()
+    job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
+    job_similarity = util.pytorch_cos_sim(torch.tensor(skill_embeddings), torch.tensor(job_embeddings)).numpy()
+
+    # Save precomputed resources
+    with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
+    with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
+    with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
+    faiss.write_index(faiss_index, FAISS_INDEX_PATH)
+    with open(COURSE_SIMILARITY_PATH, 'wb') as f: pickle.dump(course_similarity, f)
+    with open(JOB_SIMILARITY_PATH, 'wb') as f: pickle.dump(job_similarity, f)
+    universal_model.save(UNIVERSAL_MODEL_PATH)
+    logger.info(f"Precomputed resources saved to {chosen_model_dir}")
+
+# Evaluation with precomputed data
 def evaluate_response(args):
     try:
+        skill, user_answer, question_idx = args
         if not user_answer:
             return skill, 0.0, False
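The comment above marks precompute_resources() as an offline step to be run separately from the web app. A minimal way to do that, assuming the module above is saved as app.py and is importable (the script name and invocation below are assumptions, not part of this commit):

# precompute.py - hypothetical one-off script, run once before serving requests
from app import precompute_resources  # assumes the Flask module above lives in app.py

if __name__ == "__main__":
    # Writes the TF-IDF vectorizer, similarity matrices and FAISS index
    # into chosen_model_dir so load_precomputed_resources() can pick them up.
    precompute_resources()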
 
 
         probs = scipy.special.softmax(logits, axis=1).tolist()[0]
         is_ai = probs[1] > 0.5

+        user_embedding = universal_model.encode([user_answer], batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")[0]
+        expected_embedding = torch.tensor(answer_embeddings[question_idx])
+        score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100

         user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
         skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
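For reference, a minimal, self-contained sketch of the encode-and-cosine-similarity scoring pattern used in evaluate_response above (the model name and sample answers are illustrative, not taken from the commit):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # any sentence-transformers checkpoint works the same way

user_answer = "Restart the pods with kubectl rollout restart."
expected_answer = "Use kubectl rollout restart to redeploy the pods."

# Encode both texts and map cosine similarity to a 0-100 score, mirroring the hunk above.
embeddings = model.encode([user_answer, expected_answer], convert_to_tensor=True)
score = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item() * 100
print(f"semantic score: {score:.1f}")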
 
         logger.error(f"Evaluation error for {skill}: {e}")
         return skill, 0.0, False

+# Course recommendation with precomputed similarity
 def recommend_courses(skills_to_improve, user_level, upgrade=False):
     try:
         if not skills_to_improve or courses_df.empty:
             return []

+        skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in skills_to_improve if skill in questions_df['Skill'].unique()]
+        if not skill_indices:
+            return []

+        similarities = course_similarity[skill_indices]
+        total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * courses_df.get('popularity', 0.8).values + 0.2 * courses_df.get('completion_rate', 0.7).values

         target_level = 'Advanced' if upgrade else user_level
+        idx = np.argsort(-total_scores)[:5]
+        candidates = courses_df.iloc[idx]
+        candidates = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
+        return candidates[['course_title', 'Organization']].values.tolist()[:3]
     except Exception as e:
         logger.error(f"Course recommendation error: {e}")
         return []

+# Job recommendation with precomputed similarity
 def recommend_jobs(user_skills, user_level):
     try:
         if jobs_df.empty:
             return []

+        skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in user_skills if skill in questions_df['Skill'].unique()]
+        if not skill_indices:
+            return []
+
+        similarities = job_similarity[skill_indices]
+        total_scores = 0.5 * np.max(similarities, axis=0)

         if 'level' not in jobs_df.columns:
             jobs_df['level'] = 'Intermediate'

         level_scores = level_col.apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2)

         location_pref = jobs_df.get('location', pd.Series(['Remote'] * len(jobs_df))).apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7)
+        total_job_scores = total_scores + 0.2 * level_scores + 0.1 * location_pref
         top_job_indices = np.argsort(-total_job_scores)[:5]

         return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'],
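Both recommenders above reduce to the same lookup: select the precomputed similarity rows for the user's skills, collapse them with a max over skills, blend in the remaining signals, and rank. A small illustrative example with toy numbers (not data from the commit):

import numpy as np

# Toy precomputed matrix: one row per known skill, one column per course.
course_similarity = np.array([
    [0.82, 0.10, 0.45],  # e.g. Linux vs. three courses
    [0.05, 0.91, 0.30],  # e.g. Git vs. the same courses
])
popularity = np.array([0.85, 0.90, 0.80])
completion_rate = np.array([0.65, 0.70, 0.60])

skill_indices = [0, 1]                           # the user's skills mapped to matrix rows
similarities = course_similarity[skill_indices]  # select the relevant rows
total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * popularity + 0.2 * completion_rate
ranked = np.argsort(-total_scores)               # best-scoring course indices first
print(ranked, total_scores[ranked])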
 
     if len(answers) != len(user_skills):
         return jsonify({"error": "Answers count must match skills count"}), 400

+    load_precomputed_resources()

     user_questions = []
     for skill in user_skills:
+        skill_questions = questions_df[questions_df['Skill'] == skill]
         if not skill_questions.empty:
             user_questions.append(skill_questions.sample(1).iloc[0])
         else:
 
             })
     user_questions = pd.DataFrame(user_questions).reset_index(drop=True)

     user_responses = []
     for idx, row in user_questions.iterrows():
         answer = answers[idx]
         if not answer or answer.lower() == 'skip':
+            user_responses.append((row['Skill'], None, None))
         else:
+            question_idx = questions_df.index[questions_df['Question'] == row['Question']][0]
+            user_responses.append((row['Skill'], answer, question_idx))
+
+    results = [evaluate_response(response) for response in user_responses]

     user_scores = {}
     ai_flags = {}
     scores_list = []
+    skipped_questions = [f"{skill} ({question})" for skill, user_code, _ in user_responses if not user_code]
     for skill, score, is_ai in results:
         if skill in user_scores:
             user_scores[skill] = max(user_scores[skill], score)
 
     dynamic_threshold = max(40, mean_score)
     weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]

     courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
     jobs = recommend_jobs(user_skills, user_level)
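To make the thresholding just above concrete, a small worked example (the scores are illustrative):

import numpy as np

user_scores = {"Linux": 35.0, "Git": 80.0, "Python": 55.0}
scores_list = list(user_scores.values())

mean_score = np.mean(scores_list)            # (35 + 80 + 55) / 3 = 56.7
dynamic_threshold = max(40, mean_score)      # never drops below 40
weak_skills = [s for s, v in user_scores.items() if v < dynamic_threshold]
print(dynamic_threshold, weak_skills)        # 56.66..., ['Linux', 'Python']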