Muhammad541 committed on
Commit ceba453 · verified · 1 Parent(s): 2332fbf

Update app.py

Files changed (1): app.py (+215 −289)

app.py CHANGED
@@ -22,21 +22,18 @@ logger = logging.getLogger(__name__)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 # Paths for saving artifacts
-MODEL_DIR = "./saved_models"  # Primary location in /app/saved_models
-FALLBACK_MODEL_DIR = "/tmp/saved_models"  # Fallback if ./saved_models fails
 
-# Try to use the primary directory, fall back to /tmp if needed
 try:
     os.makedirs(MODEL_DIR, exist_ok=True)
-    logger.info(f"Successfully created/accessed directory: {MODEL_DIR}")
     chosen_model_dir = MODEL_DIR
-except PermissionError as e:
-    logger.warning(f"Permission denied creating directory {MODEL_DIR}: {e}. Falling back to {FALLBACK_MODEL_DIR}")
     os.makedirs(FALLBACK_MODEL_DIR, exist_ok=True)
     chosen_model_dir = FALLBACK_MODEL_DIR
-except Exception as e:
-    logger.error(f"Unexpected error creating directory {MODEL_DIR}: {e}")
-    raise
 
 # Update paths based on the chosen directory
 UNIVERSAL_MODEL_PATH = os.path.join(chosen_model_dir, "universal_model")
@@ -46,316 +43,245 @@ SKILL_TFIDF_PATH = os.path.join(chosen_model_dir, "skill_tfidf.pkl")
 QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
 FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
 
-# Load Datasets
 def load_dataset(file_path, required_columns=[]):
     try:
         df = pd.read_csv(file_path)
         for col in required_columns:
             if col not in df.columns:
                 logger.warning(f"Column '{col}' missing in {file_path}. Using default values.")
-                df[col] = "" if col != 'level' else 'Intermediate'
         return df
-    except FileNotFoundError:
-        logger.error(f"Dataset not found at {file_path}. Exiting.")
         return None
 
-questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"])
-courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"])
-jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"])
-
-# Simulate courses_df with relevant skills
-if courses_df is None or 'skills' not in courses_df.columns or courses_df['skills'].str.strip().eq('').all():
-    courses_df = pd.DataFrame({
-        'skills': ['Docker', 'Jenkins', 'Azure', 'Cybersecurity'],
-        'course_title': ['Docker Mastery', 'Jenkins CI/CD', 'Azure Fundamentals', 'Cybersecurity Basics'],
-        'Organization': ['Udemy', 'Coursera', 'Microsoft', 'edX'],
-        'level': ['Intermediate', 'Intermediate', 'Intermediate', 'Advanced'],
-        'popularity': [0.9, 0.85, 0.95, 0.8],
-        'completion_rate': [0.7, 0.65, 0.8, 0.6]
-    })
-
-# Validate questions_df
-if questions_df is None or questions_df.empty:
-    logger.error("questions_df is empty or could not be loaded. Exiting.")
-    exit(1)
-if not all(col in questions_df.columns for col in ["Skill", "Question", "Answer"]):
-    logger.error("questions_df is missing required columns. Exiting.")
-    exit(1)
-logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {questions_df['Skill'].unique().tolist()}")
-
-# Load or Initialize Models
-if os.path.exists(UNIVERSAL_MODEL_PATH):
-    universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH)
-else:
-    universal_model = SentenceTransformer("all-MiniLM-L6-v2")
-
-if os.path.exists(DETECTOR_MODEL_PATH):
-    detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
-    detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH)
-else:
-    detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
-    detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
-
-# Precompute Resources with Validation
-def resources_valid(saved_skills, current_skills):
-    return set(saved_skills) == set(current_skills)
 
 def initialize_resources(user_skills):
     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings
-    if (os.path.exists(TFIDF_PATH) and os.path.exists(SKILL_TFIDF_PATH) and
-        os.path.exists(QUESTION_ANSWER_PATH) and os.path.exists(FAISS_INDEX_PATH)):
-        with open(TFIDF_PATH, 'rb') as f:
-            tfidf_vectorizer = pickle.load(f)
-        with open(SKILL_TFIDF_PATH, 'rb') as f:
-            skill_tfidf = pickle.load(f)
-        with open(QUESTION_ANSWER_PATH, 'rb') as f:
-            question_to_answer = pickle.load(f)
-        faiss_index = faiss.read_index(FAISS_INDEX_PATH)
-        answer_embeddings = universal_model.encode(list(question_to_answer.values()), convert_to_tensor=True, show_progress_bar=False).cpu().numpy()
-
-        if not resources_valid(skill_tfidf.keys(), [s.lower() for s in user_skills]):
-            logger.info("⚠ Saved skill TF-IDF mismatch detected. Recomputing resources.")
-            tfidf_vectorizer = TfidfVectorizer(stop_words='english')
-            all_texts = user_skills + questions_df['Answer'].fillna("").tolist() + questions_df['Question'].tolist()
-            tfidf_vectorizer.fit(all_texts)
-            skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill.lower()]).toarray()[0] for skill in user_skills}
-            question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
-            answer_embeddings = universal_model.encode(list(question_to_answer.values()), convert_to_tensor=True, show_progress_bar=False).cpu().numpy()
-            faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
-            faiss_index.add(answer_embeddings)
     else:
         tfidf_vectorizer = TfidfVectorizer(stop_words='english')
         all_texts = user_skills + questions_df['Answer'].fillna("").tolist() + questions_df['Question'].tolist()
         tfidf_vectorizer.fit(all_texts)
-        skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill.lower()]).toarray()[0] for skill in user_skills}
         question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
-        answer_embeddings = universal_model.encode(list(question_to_answer.values()), convert_to_tensor=True, show_progress_bar=False).cpu().numpy()
         faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
         faiss_index.add(answer_embeddings)
-
-    with open(TFIDF_PATH, 'wb') as f:
-        pickle.dump(tfidf_vectorizer, f)
-    with open(SKILL_TFIDF_PATH, 'wb') as f:
-        pickle.dump(skill_tfidf, f)
-    with open(QUESTION_ANSWER_PATH, 'wb') as f:
-        pickle.dump(question_to_answer, f)
     faiss.write_index(faiss_index, FAISS_INDEX_PATH)
-    universal_model.save_pretrained(UNIVERSAL_MODEL_PATH)
-    detector_model.save_pretrained(DETECTOR_MODEL_PATH)
-    detector_tokenizer.save_pretrained(DETECTOR_MODEL_PATH)
-    logger.info(f"Models and resources saved to {chosen_model_dir}")
 
-# Evaluate Responses
 def evaluate_response(args):
-    skill, user_answer, question = args
-    if not user_answer:
-        return skill, 0, False
-
-    inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
-    with torch.no_grad():
-        logits = detector_model(**inputs).logits
-    probs = scipy.special.softmax(logits, axis=1).tolist()[0]
-    is_ai_generated = probs[1] > 0.5
-
-    user_embedding = universal_model.encode(user_answer, convert_to_tensor=True)
-    expected_answer = question_to_answer.get(question, "")
-    expected_embedding = universal_model.encode(expected_answer, convert_to_tensor=True)
-    score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
-
-    user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
-    skill_lower = skill.lower()
-    skill_vec = skill_tfidf.get(skill_lower, tfidf_vectorizer.transform([skill_lower]).toarray()[0])
-    skill_relevance = np.dot(user_tfidf, skill_vec) / (np.linalg.norm(user_tfidf) * np.linalg.norm(skill_vec) + 1e-10)
-    penalty = min(1.0, max(0.5, skill_relevance))
-    score *= penalty
-
-    logger.debug(f"Evaluated {skill}: score={score:.2f}, is_ai={is_ai_generated}")
-    return skill, round(max(0, score), 2), is_ai_generated
-
-# Recommend Courses
 def recommend_courses(skills_to_improve, user_level, upgrade=False):
-    if not skills_to_improve:
         return []
-
-    skill_embeddings = universal_model.encode(skills_to_improve, convert_to_tensor=True)
-    course_embeddings = universal_model.encode(courses_df['skills'].fillna(""), convert_to_tensor=True)
-    bert_similarities = util.pytorch_cos_sim(skill_embeddings, course_embeddings).numpy()
-
-    collab_scores = []
-    for skill in skills_to_improve:
-        overlap = 1  # Simplified since user_df is removed
-        collab_scores.append(overlap)
-    collab_similarities = np.array([collab_scores]).repeat(len(courses_df), axis=0).T
-
-    popularity = courses_df['popularity'].fillna(0.5).to_numpy()
-    completion = courses_df['completion_rate'].fillna(0.5).to_numpy()
-    total_scores = (0.6 * bert_similarities + 0.2 * collab_similarities + 0.1 * popularity + 0.1 * completion)
-
-    recommended_courses = []
-    target_level = 'Advanced' if upgrade else user_level
-    for i, skill in enumerate(skills_to_improve):
-        top_indices = total_scores[i].argsort()[-5:][::-1]
-        candidates = courses_df.iloc[top_indices]
-        candidates = candidates[candidates['skills'].str.lower() == skill.lower()]
-        if candidates.empty:
-            candidates = courses_df.iloc[top_indices]
-        candidates.loc[:, "level_match"] = candidates['level'].apply(lambda x: 1 if x == target_level else 0.8 if abs({'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}[x] - {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}[user_level]) <= 1 else 0.5)
-        level_filtered = candidates.sort_values(by="level_match", ascending=False)
-        recommended_courses.extend(level_filtered[['course_title', 'Organization']].values.tolist()[:3])
-    return list(dict.fromkeys(tuple(course) for course in recommended_courses if course[0].strip()))
-
-# Recommend Jobs
 def recommend_jobs(user_skills, user_level):
-    job_field = 'required_skills' if 'required_skills' in jobs_df.columns and not jobs_df['required_skills'].str.strip().eq('').all() else 'job_description'
-    job_embeddings = universal_model.encode(jobs_df[job_field].fillna(""), convert_to_tensor=True)
-    user_embedding = universal_model.encode(" ".join(user_skills), convert_to_tensor=True)
-    skill_similarities = util.pytorch_cos_sim(user_embedding, job_embeddings).numpy()[0]
-
-    level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
-    user_level_num = level_map[user_level]
-    exp_match = jobs_df['level'].fillna('Intermediate').apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num) / 2) if 'level' in jobs_df.columns else np.ones(len(jobs_df)) * 0.5
-    location_pref = jobs_df['location'].apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7).to_numpy()
-    industry_embeddings = universal_model.encode(jobs_df['job_title'].fillna(""), convert_to_tensor=True)
-    industry_similarities = util.pytorch_cos_sim(user_embedding, industry_embeddings).numpy()[0]
-
-    total_job_scores = (0.5 * skill_similarities + 0.2 * exp_match + 0.1 * location_pref + 0.2 * industry_similarities)
-    top_job_indices = total_job_scores.argsort()[-5:][::-1]
-    return [(jobs_df.iloc[idx]['job_title'], jobs_df.iloc[idx]['company_name'], jobs_df.iloc[idx]['location']) for idx in top_job_indices]
-
-# Main API Endpoint
 app = Flask(__name__)
 
 @app.route('/assess', methods=['POST'])
 def assess_skills():
-    data = request.get_json()
-    logger.info(f"Received request: {data}")
-
-    # Validate required fields
-    if not data or 'user_name' not in data or 'skills' not in data or 'answers' not in data:
-        logger.error("Invalid input: Missing 'user_name', 'skills', or 'answers' in JSON body.")
-        return jsonify({"error": "Invalid input. Provide 'user_name', 'skills', and 'answers' in JSON body."}), 400
-
-    user_name = data['user_name']
-    user_skills = data['skills']
-    answers = data['answers']
-
-    # Validate inputs
-    if not isinstance(user_name, str) or not user_name.strip():
-        logger.error("Invalid user_name: Must be a non-empty string.")
-        return jsonify({"error": "Invalid user_name. Must be a non-empty string."}), 400
-
-    if not isinstance(user_skills, list) or not user_skills or not all(isinstance(skill, str) and skill.strip() for skill in user_skills):
-        logger.error("Invalid skills: Must be a non-empty list of non-empty strings.")
-        return jsonify({"error": "Invalid skills. Must be a non-empty list of non-empty strings."}), 400
-
-    if not isinstance(answers, list):
-        logger.error(f"Answers must be a list, got: {type(answers)}")
-        return jsonify({"error": "Answers must be a list."}), 400
-
-    # Ensure the number of answers matches the number of skills
-    if len(answers) != len(user_skills):
-        logger.error(f"Number of answers ({len(answers)}) does not match number of skills ({len(user_skills)}).")
-        return jsonify({"error": f"Number of answers ({len(answers)}) must match the number of skills ({len(user_skills)})."}), 400
-
-    user_level = 'Intermediate'  # Default level since user_df is removed
-    logger.info(f"User: {user_name}, Skills: {user_skills}, Level: {user_level}")
-
-    initialize_resources(user_skills)
-
-    # Normalize skills for case-insensitive matching
-    filtered_questions = questions_df[questions_df['Skill'].str.lower().isin([skill.lower() for skill in user_skills])]
-    logger.info(f"Filtered questions shape: {filtered_questions.shape}")
-    logger.info(f"Available skills in questions_df: {filtered_questions['Skill'].unique().tolist()}")
-    if filtered_questions.empty:
-        logger.error("No matching questions found for the user's skills.")
-        return jsonify({"error": "No matching questions found!"}), 500
-
-    user_questions = []
-    for skill in user_skills:
-        skill_questions = filtered_questions[filtered_questions['Skill'].str.lower() == skill.lower()]
-        logger.info(f"Questions for skill '{skill}': {len(skill_questions)}")
-        if not skill_questions.empty:
-            user_questions.append(skill_questions.sample(1).iloc[0])
-        else:
-            logger.warning(f"No questions found for skill '{skill}'. Using a default question.")
-            user_questions.append({
-                'Skill': skill,
-                'Question': f"What are the best practices for using {skill} in a production environment?",
-                'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
-            })
-    user_questions = pd.DataFrame(user_questions).reset_index(drop=True)  # Reset index to ensure sequential indices
-    logger.info(f"Selected questions: {user_questions[['Skill', 'Question']].to_dict(orient='records')}")
-    logger.info(f"Number of selected questions: {len(user_questions)}")
-
-    if len(user_questions) != len(user_skills):
-        logger.error(f"Number of selected questions ({len(user_questions)}) does not match number of skills ({len(user_skills)}).")
-        return jsonify({"error": f"Internal error: Number of selected questions ({len(user_questions)}) does not match number of skills ({len(user_skills)})."}), 500
-
-    user_responses = []
-    for idx, row in user_questions.iterrows():
-        logger.debug(f"Pairing question for skill '{row['Skill']}' with answer at index {idx}")
-        if idx >= len(answers):
-            logger.error(f"Index out of range: idx={idx}, len(answers)={len(answers)}")
-            return jsonify({"error": f"Internal error: Index {idx} out of range for answers list of length {len(answers)}."}), 500
-        answer = answers[idx]
-        if not answer or answer.lower() == 'skip':
-            user_responses.append((row['Skill'], None, row['Question']))
-        else:
-            user_responses.append((row['Skill'], answer, row['Question']))
-
     try:
-        with Pool(cpu_count()) as pool:
-            eval_args = [(skill, user_code, question) for skill, user_code, question in user_responses if user_code]
-            logger.info(f"Evaluating {len(eval_args)} answers using multiprocessing pool.")
-            results = pool.map(evaluate_response, eval_args)
-            logger.info(f"Evaluation results: {results}")
     except Exception as e:
-        logger.error(f"Error in evaluate_response: {str(e)}", exc_info=True)
-        return jsonify({"error": "Failed to evaluate answers due to an internal error."}), 500
-
-    user_scores = {}
-    ai_flags = {}
-    scores_list = []
-    skipped_questions = [f"{skill} ({question})" for skill, user_code, question in user_responses if user_code is None]
-    for skill, score, is_ai in results:
-        if skill in user_scores:
-            user_scores[skill] = max(user_scores[skill], score)
-            ai_flags[skill] = ai_flags[skill] or is_ai
-        else:
-            user_scores[skill] = score
-            ai_flags[skill] = is_ai
-        scores_list.append(score)
-
-    mean_score = np.mean(scores_list) if scores_list else 50
-    dynamic_threshold = max(40, mean_score)
-    weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
-
-    assessment_results = [
-        (skill, f"{'■' * int(score//10)}{'-' * (10 - int(score//10))}", f"{score:.2f}%", "AI-Generated" if ai_flags[skill] else "Human-Written")
-        for skill, score in user_scores.items()
-    ]
-    assessment_output = tabulate(assessment_results, headers=["Skill", "Progress", "Score", "Origin"], tablefmt="grid")
-    if skipped_questions:
-        assessment_output += f"\nSkipped Questions: {skipped_questions}"
-    assessment_output += f"\nMean Score: {mean_score:.2f}, Dynamic Threshold: {dynamic_threshold:.2f}"
-    assessment_output += f"\nWeak Skills: {weak_skills if weak_skills else 'None'}"
-
-    skills_to_recommend = weak_skills if weak_skills else user_skills
-    upgrade_flag = not weak_skills
-    recommended_courses = recommend_courses(skills_to_recommend, user_level, upgrade=upgrade_flag)
-    courses_output = tabulate(recommended_courses, headers=["Course", "Organization"], tablefmt="grid") if recommended_courses else "None"
-
-    recommended_jobs = recommend_jobs(user_skills, user_level)
-    jobs_output = tabulate(recommended_jobs, headers=["Job Title", "Company", "Location"], tablefmt="grid")
-
-    response = {
-        "user_info": f"User: {user_name}\nSkills: {user_skills}\nLevel: {user_level}",
-        "assessment_results": assessment_output,
-        "recommended_courses": courses_output,
-        "recommended_jobs": jobs_output
-    }
-    logger.info(f"Response: {response}")
-    return jsonify(response)
 
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=7860)
 
@@ -22,21 +22,18 @@ logger = logging.getLogger(__name__)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 # Paths for saving artifacts
+MODEL_DIR = "./saved_models"
+FALLBACK_MODEL_DIR = "/tmp/saved_models"
 
+# Directory handling with improved error handling
 try:
     os.makedirs(MODEL_DIR, exist_ok=True)
+    logger.info(f"Using model directory: {MODEL_DIR}")
     chosen_model_dir = MODEL_DIR
+except Exception as e:
+    logger.warning(f"Failed to create {MODEL_DIR}: {e}. Using fallback directory.")
     os.makedirs(FALLBACK_MODEL_DIR, exist_ok=True)
     chosen_model_dir = FALLBACK_MODEL_DIR
 
 # Update paths based on the chosen directory
 UNIVERSAL_MODEL_PATH = os.path.join(chosen_model_dir, "universal_model")
@@ -46,316 +43,245 @@ SKILL_TFIDF_PATH = os.path.join(chosen_model_dir, "skill_tfidf.pkl")
 QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
 FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
 
+# Improved dataset loading with fallback
 def load_dataset(file_path, required_columns=[]):
     try:
         df = pd.read_csv(file_path)
         for col in required_columns:
             if col not in df.columns:
                 logger.warning(f"Column '{col}' missing in {file_path}. Using default values.")
+                df[col] = ""
         return df
+    except Exception as e:
+        logger.error(f"Error loading {file_path}: {e}")
         return None
 
+# Load datasets with fallbacks. Note: `load_dataset(...) or fallback` would raise
+# "the truth value of a DataFrame is ambiguous" on a successful load, so check
+# for None explicitly instead.
+questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"])
+if questions_df is None:
+    questions_df = pd.DataFrame({
+        'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
+        'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
+                     'Intermediate Python question', 'Basic Kubernetes question'],
+        'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
+    })
+
+courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"])
+if courses_df is None:
+    courses_df = pd.DataFrame({
+        'skills': ['Docker', 'Jenkins', 'Azure', 'Cybersecurity'],
+        'course_title': ['Docker Mastery', 'Jenkins CI/CD', 'Azure Fundamentals', 'Cybersecurity Basics'],
+        'Organization': ['Udemy', 'Coursera', 'Microsoft', 'edX'],
+        'level': ['Intermediate', 'Intermediate', 'Intermediate', 'Advanced'],
+        'popularity': [0.9, 0.85, 0.95, 0.8],
+        'completion_rate': [0.7, 0.65, 0.8, 0.6]
+    })
+
+jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"])
+if jobs_df is None:
+    jobs_df = pd.DataFrame({
+        'job_title': ['DevOps Engineer', 'Cloud Architect'],
+        'company_name': ['Tech Corp', 'Cloud Inc'],
+        'location': ['Remote', 'Silicon Valley'],
+        'required_skills': ['Linux, Cloud', 'AWS, Kubernetes'],
+        'job_description': ['DevOps role description', 'Cloud architecture position']
+    })
+
+# Model loading with validation
+def load_model(model_class, path, default_name):
+    try:
+        return model_class.from_pretrained(path)
+    except Exception as e:
+        logger.warning(f"Failed to load model from {path}: {e}. Using default {default_name}.")
+        return model_class.from_pretrained(default_name)
 
+universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH) if os.path.exists(UNIVERSAL_MODEL_PATH) else SentenceTransformer("all-MiniLM-L6-v2")
+detector_model = load_model(AutoModelForSequenceClassification, DETECTOR_MODEL_PATH, "roberta-base-openai-detector")
+detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH) if os.path.exists(DETECTOR_MODEL_PATH) else AutoTokenizer.from_pretrained("roberta-base-openai-detector")
+
+# Enhanced resource initialization
 def initialize_resources(user_skills):
     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings
+
+    user_skills_lower = [s.lower() for s in user_skills]
+    needs_recompute = False
+
+    if all(os.path.exists(p) for p in [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH]):
+        try:
+            with open(TFIDF_PATH, 'rb') as f:
+                tfidf_vectorizer = pickle.load(f)
+            with open(SKILL_TFIDF_PATH, 'rb') as f:
+                skill_tfidf = pickle.load(f)
+            with open(QUESTION_ANSWER_PATH, 'rb') as f:
+                question_to_answer = pickle.load(f)
+            faiss_index = faiss.read_index(FAISS_INDEX_PATH)
+
+            if set(skill_tfidf.keys()) != set(user_skills_lower):
+                logger.info("Skill mismatch detected, recomputing resources")
+                needs_recompute = True
+        except Exception as e:
+            logger.error(f"Error loading saved resources: {e}")
+            needs_recompute = True
     else:
+        needs_recompute = True
+
+    if needs_recompute:
+        logger.info("Building new resources")
         tfidf_vectorizer = TfidfVectorizer(stop_words='english')
         all_texts = user_skills + questions_df['Answer'].fillna("").tolist() + questions_df['Question'].tolist()
         tfidf_vectorizer.fit(all_texts)
+
+        skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in user_skills}
         question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
+        answer_embeddings = universal_model.encode(list(question_to_answer.values()), convert_to_tensor=True).cpu().numpy()
+
        faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
         faiss_index.add(answer_embeddings)
+
+        # Save resources
+        with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
+        with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
+        with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
         faiss.write_index(faiss_index, FAISS_INDEX_PATH)
+        universal_model.save(UNIVERSAL_MODEL_PATH)
+        logger.info(f"Resources saved to {chosen_model_dir}")
 
 
 
+# Enhanced evaluation with error handling
 def evaluate_response(args):
+    skill, user_answer, question = args  # unpack first so `skill` is defined if evaluation fails
+    try:
+        if not user_answer:
+            return skill, 0.0, False
+
+        inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
+        with torch.no_grad():
+            logits = detector_model(**inputs).logits
+        probs = scipy.special.softmax(logits, axis=1).tolist()[0]
+        is_ai = probs[1] > 0.5
+
+        expected_answer = question_to_answer.get(question, "")
+        user_embedding = universal_model.encode(user_answer, convert_to_tensor=True)
+        expected_embedding = universal_model.encode(expected_answer, convert_to_tensor=True)
+        score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
+
+        user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
+        skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
+        relevance = np.dot(user_tfidf, skill_vec) / (np.linalg.norm(user_tfidf) * np.linalg.norm(skill_vec) + 1e-10)
+        score *= max(0.5, min(1.0, relevance))
+
+        return skill, round(max(0, score), 2), is_ai
+    except Exception as e:
+        logger.error(f"Evaluation error for {skill}: {e}")
+        return skill, 0.0, False
+
+# Improved course recommendation
 def recommend_courses(skills_to_improve, user_level, upgrade=False):
+    try:
+        if not skills_to_improve or courses_df.empty:
+            return []
+
+        # Add missing columns if needed
+        if 'popularity' not in courses_df:
+            courses_df['popularity'] = 0.8
+        if 'completion_rate' not in courses_df:
+            courses_df['completion_rate'] = 0.7
+
+        skill_embeddings = universal_model.encode(skills_to_improve, convert_to_tensor=True)
+        course_embeddings = universal_model.encode(courses_df['skills'].fillna(""), convert_to_tensor=True)
+        similarities = util.pytorch_cos_sim(skill_embeddings, course_embeddings).numpy()
+
+        total_scores = 0.6 * similarities + 0.2 * courses_df['popularity'].values + 0.2 * courses_df['completion_rate'].values
+
+        recommendations = []
+        target_level = 'Advanced' if upgrade else user_level
+        for i, skill in enumerate(skills_to_improve):
+            idx = np.argsort(-total_scores[i])[:5]
+            candidates = courses_df.iloc[idx]
+            # na=False keeps rows with missing 'level' from raising during boolean masking
+            candidates = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
+            recommendations.extend(candidates[['course_title', 'Organization']].values.tolist()[:3])
+
+        return list(dict.fromkeys(map(tuple, recommendations)))
+    except Exception as e:
+        logger.error(f"Course recommendation error: {e}")
         return []
+
+ # Enhanced job recommendation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  def recommend_jobs(user_skills, user_level):
204
+ try:
205
+ if jobs_df.empty:
206
+ return []
207
+
208
+ job_field = 'required_skills' if 'required_skills' in jobs_df.columns else 'job_description'
209
+ job_embeddings = universal_model.encode(jobs_df[job_field].fillna(""), convert_to_tensor=True)
210
+ user_embedding = universal_model.encode(" ".join(user_skills), convert_to_tensor=True)
211
+ similarities = util.pytorch_cos_sim(user_embedding, job_embeddings).numpy()[0]
212
+
213
+ level_scores = jobs_df.get('level', 'Intermediate').apply(
214
+ lambda x: 1 - abs({'Beginner':0, 'Intermediate':1, 'Advanced':2}.get(x,1) -
215
+ {'Beginner':0, 'Intermediate':1, 'Advanced':2}[user_level])/2
216
+ )
217
+ total_scores = 0.6 * similarities + 0.4 * level_scores
218
+ top_idx = np.argsort(-total_scores)[:5]
219
+
220
+ return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'],
221
+ jobs_df.iloc[i].get('location', 'Remote')) for i in top_idx]
222
+ except Exception as e:
223
+ logger.error(f"Job recommendation error: {e}")
224
+ return []
+
+# Flask application setup
 app = Flask(__name__)
 
+@app.route('/')
+def health_check():
+    return jsonify({"status": "active", "model_dir": chosen_model_dir})
+
 @app.route('/assess', methods=['POST'])
 def assess_skills():
     try:
+        data = request.get_json()
+        if not data or 'skills' not in data or 'answers' not in data:
+            return jsonify({"error": "Missing required fields"}), 400
+
+        user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
+        answers = [a.strip() for a in data['answers'] if isinstance(a, str)]
+        user_level = data.get('user_level', 'Intermediate').strip()
+
+        if len(answers) != len(user_skills):
+            return jsonify({"error": "Answers count must match skills count"}), 400
+
+        initialize_resources(user_skills)
+
+        # Get relevant questions
+        user_questions = questions_df[questions_df['Skill'].str.lower().isin([s.lower() for s in user_skills])]
+        if user_questions.empty:
+            user_questions = questions_df
+
+        # Sample with replacement when there are fewer matching questions than
+        # skills, so .sample() cannot raise for oversized requests
+        user_questions = user_questions.sample(
+            len(user_skills), replace=len(user_questions) < len(user_skills)
+        ).reset_index(drop=True)
+        responses = list(zip(user_questions['Skill'], answers, user_questions['Question']))
+
+        # Parallel processing with error handling
+        with Pool(processes=min(cpu_count(), 4)) as pool:
+            results = pool.map(evaluate_response, responses)
+
+        # Process results
+        assessment = []
+        scores = []
+        for skill, score, is_ai in results:
+            assessment.append(f"{skill}: {score}% ({'AI' if is_ai else 'Human'})")
+            scores.append(score)
+
+        mean_score = np.mean(scores) if scores else 0
+        weak_skills = [skill for skill, score, _ in results if score < max(60, mean_score)]
+
+        # Generate recommendations
+        courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
+        jobs = recommend_jobs(user_skills, user_level)
+
+        return jsonify({
+            "assessment": assessment,
+            "mean_score": round(mean_score, 1),
+            "weak_skills": weak_skills,
+            "courses": courses[:3],  # Top 3 courses
+            "jobs": jobs[:5]  # Top 5 jobs
+        })
     except Exception as e:
+        logger.error(f"Assessment error: {e}")
+        return jsonify({"error": "Internal server error"}), 500
 
 if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860, threaded=True)
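
Example request (a minimal sketch, not part of the commit): assuming the app runs locally on port 7860 with the bundled CSVs or their built-in fallbacks available, the new endpoints can be exercised as below. The payload keys ('skills', 'answers', optional 'user_level') and the response keys match the new /assess handler above; the sample skills and answers are illustrative only.

import requests

BASE_URL = "http://localhost:7860"  # assumed local deployment; adjust as needed

# Health check: the new '/' route reports status and the chosen model directory.
print(requests.get(f"{BASE_URL}/").json())

# Skill assessment: 'skills' and 'answers' must be the same length;
# 'user_level' is optional and defaults to 'Intermediate'.
payload = {
    "skills": ["Python", "Git"],
    "answers": [
        "Use virtual environments, type hints, and unit tests.",
        "Commit small changes with clear messages and a branch per feature.",
    ],
    "user_level": "Intermediate",
}
resp = requests.post(f"{BASE_URL}/assess", json=payload)
print(resp.status_code)
print(resp.json())  # keys: assessment, mean_score, weak_skills, courses, jobs

Unlike the old endpoint, the new handler no longer requires a 'user_name' field.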