Muhammad541 committed on
Commit db6e637 · verified · 1 Parent(s): 5a77e46

Update app.py

Files changed (1)
  1. app.py +257 -104
app.py CHANGED
@@ -1,21 +1,21 @@
  import os
- import numpy as np
  import torch
  from sentence_transformers import SentenceTransformer, util
  import faiss
  import pickle
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
  import scipy.special
  from flask import Flask, request, jsonify
  import logging
- from pymongo import MongoClient
- import pandas as pd

  # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Disable tokenizers parallelism
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

  # Paths for saving artifacts
@@ -34,127 +34,282 @@ except Exception as e:
  # Update paths
  UNIVERSAL_MODEL_PATH = os.path.join(chosen_model_dir, "universal_model")
  DETECTOR_MODEL_PATH = os.path.join(chosen_model_dir, "detector_model")
  FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
  ANSWER_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "answer_embeddings.pkl")
- COURSE_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "course_embeddings.pkl")
- JOB_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "job_embeddings.pkl")

- # MongoDB connection (use the same URI as your Express app)
- MONGO_URI = "mongodb://localhost:27017/DMS" # Replace with your MongoDB URI
- client = MongoClient(MONGO_URI)
- db = client.get_database()
-
- # Load models
- universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH) if os.path.exists(UNIVERSAL_MODEL_PATH) else SentenceTransformer("all-MiniLM-L6-v2")
- detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH) if os.path.exists(DETECTOR_MODEL_PATH) else AutoTokenizer.from_pretrained("roberta-base-openai-detector")
- detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH) if os.path.exists(DETECTOR_MODEL_PATH) else AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
-
- # Global variables
  faiss_index = None
  answer_embeddings = None
- course_embeddings = None
- job_embeddings = None

- # Load data from MongoDB
- def load_mongodb_data():
-     global answer_embeddings, course_embeddings, job_embeddings, faiss_index
      try:
-         # Load questions from Generated_Skill-Based_Questions.csv (for now, keep as fallback; later, move to MongoDB)
-         questions_df = pd.read_csv("Generated_Skill-Based_Questions.csv") # Replace with MongoDB query if stored
-         courses = list(db.courses.find()) # Fetch all courses
-         jobs = list(db.jobs.find()) # Fetch all jobs
-
-         # Precompute embeddings
          answer_embeddings = universal_model.encode(questions_df['Answer'].tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
-         course_skills = [course['skills'] for course in courses] # Adjust based on your Course schema
-         course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
-         job_skills = [job['skills'] for job in jobs] # Adjust based on your Job schema
-         job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
-
-         # Build FAISS index
          faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
          faiss_index.add(answer_embeddings)
-
-         # Save precomputed data
-         with open(ANSWER_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(answer_embeddings, f)
-         with open(COURSE_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(course_embeddings, f)
-         with open(JOB_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(job_embeddings, f)
          faiss.write_index(faiss_index, FAISS_INDEX_PATH)
-         logger.info("Loaded and precomputed MongoDB data successfully")
      except Exception as e:
-         logger.error(f"Error loading MongoDB data: {e}")
          raise

- # Evaluate response (unchanged logic, but use MongoDB questions if stored)
  def evaluate_response(args):
-     skill, user_answer, question_idx = args
-     if not user_answer:
          return skill, 0.0, False

-     inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
-     with torch.no_grad():
-         logits = detector_model(**inputs).logits
-     probs = scipy.special.softmax(logits, axis=1).tolist()[0]
-     is_ai = probs[1] > 0.5
-
-     user_embedding = universal_model.encode([user_answer], batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")[0]
-     expected_embedding = torch.tensor(answer_embeddings[question_idx])
-     score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
-     return skill, round(max(0, score), 2), is_ai
-
- # Recommend courses from MongoDB
  def recommend_courses(skills_to_improve, user_level, upgrade=False):
-     if not skills_to_improve or not course_embeddings:
-         return []
-
-     skill_indices = [i for i, skill in enumerate(questions_df['Skill'].unique()) if skill in skills_to_improve]
-     if not skill_indices:
          return []

-     similarities = util.pytorch_cos_sim(
-         torch.tensor(universal_model.encode(questions_df['Skill'].unique()[skill_indices].tolist(), batch_size=128)),
-         torch.tensor(course_embeddings)
-     ).cpu().numpy()
-
-     courses = list(db.courses.find())
-     popularity = [course.get('popularity', 0.8) for course in courses]
-     completion_rate = [course.get('completion_rate', 0.7) for course in courses]
-     total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * np.array(popularity) + 0.2 * np.array(completion_rate)
-
-     target_level = 'Advanced' if upgrade else user_level
-     idx = np.argsort(-total_scores)[:5]
-     candidates = [courses[i] for i in idx]
-     filtered_candidates = [c for c in candidates if target_level.lower() in c.get('level', 'Intermediate').lower()]
-     return filtered_candidates[:3] if filtered_candidates else candidates[:3]
-
- # Recommend jobs from MongoDB
  def recommend_jobs(user_skills, user_level):
-     if not job_embeddings:
-         return []
-
-     skill_indices = [i for i, skill in enumerate(questions_df['Skill'].unique()) if skill in user_skills]
-     if not skill_indices:
          return []

-     similarities = util.pytorch_cos_sim(
-         torch.tensor(universal_model.encode(questions_df['Skill'].unique()[skill_indices].tolist(), batch_size=128)),
-         torch.tensor(job_embeddings)
-     ).cpu().numpy()
-
-     jobs = list(db.jobs.find())
-     level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
-     user_level_num = level_map.get(user_level, 1)
-     level_scores = [1 - abs(level_map.get(job.get('level', 'Intermediate'), 1) - user_level_num) / 2 for job in jobs]
-     location_pref = [1.0 if job.get('location', 'Remote') in ['Islamabad', 'Karachi'] else 0.7 for job in jobs]
-     total_job_scores = 0.5 * np.max(similarities, axis=0) + 0.2 * np.array(level_scores) + 0.1 * np.array(location_pref)
-
-     top_job_indices = np.argsort(-total_job_scores)[:5]
-     return [(jobs[i]['jobTitle'], jobs[i]['companyName'], jobs[i].get('location', 'Remote')) for i in top_job_indices]
-
- # Flask app setup
  app = Flask(__name__)

- @app.route('/health')
  def health_check():
      return jsonify({"status": "active", "model_dir": chosen_model_dir})

@@ -172,10 +327,8 @@ def assess_skills():
          if len(answers) != len(user_skills):
              return jsonify({"error": "Answers count must match skills count"}), 400

-         load_mongodb_data() # Load and precompute MongoDB data

-         # Generate questions (for now, use CSV as fallback; move to MongoDB later)
-         questions_df = pd.read_csv("Generated_Skill-Based_Questions.csv")
          user_questions = []
          for skill in user_skills:
              skill_questions = questions_df[questions_df['Skill'] == skill]
@@ -235,7 +388,7 @@ def assess_skills():
                  "weak_skills": weak_skills,
                  "skipped_questions": skipped_questions
              },
-             "recommended_courses": [{"course_title": c['title'], "organization": c.get('organization', 'Unknown')} for c in courses],
              "recommended_jobs": jobs[:5]
          })
      except Exception as e:
 
  import os
+ import pandas as pd
  import torch
  from sentence_transformers import SentenceTransformer, util
  import faiss
+ import numpy as np
  import pickle
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
  import scipy.special
+ from sklearn.feature_extraction.text import TfidfVectorizer
  from flask import Flask, request, jsonify
  import logging

  # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # Disable tokenizers parallelism to avoid fork-related deadlocks
  os.environ["TOKENIZERS_PARALLELISM"] = "false"

  # Paths for saving artifacts
 
  # Update paths
  UNIVERSAL_MODEL_PATH = os.path.join(chosen_model_dir, "universal_model")
  DETECTOR_MODEL_PATH = os.path.join(chosen_model_dir, "detector_model")
+ TFIDF_PATH = os.path.join(chosen_model_dir, "tfidf_vectorizer.pkl")
+ SKILL_TFIDF_PATH = os.path.join(chosen_model_dir, "skill_tfidf.pkl")
+ QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
  FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
  ANSWER_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "answer_embeddings.pkl")
+ COURSE_SIMILARITY_PATH = os.path.join(chosen_model_dir, "course_similarity.pkl")
+ JOB_SIMILARITY_PATH = os.path.join(chosen_model_dir, "job_similarity.pkl")

+ # Global variables for precomputed data
+ tfidf_vectorizer = None
+ skill_tfidf = None
+ question_to_answer = None
  faiss_index = None
  answer_embeddings = None
+ course_similarity = None
+ job_similarity = None

+ # Improved dataset loading with fallback
+ def load_dataset(file_path, required_columns=[], additional_columns=['popularity', 'completion_rate'], fallback_data=None):
      try:
+         df = pd.read_csv(file_path)
+         missing_required = [col for col in required_columns if col not in df.columns]
+         missing_additional = [col for col in additional_columns if col not in df.columns]
+
+         # Handle missing required columns
+         if missing_required:
+             logger.warning(f"Required columns {missing_required} missing in {file_path}. Adding empty values.")
+             for col in missing_required:
+                 df[col] = ""
+
+         # Handle missing additional columns (popularity, completion_rate, etc.)
+         if missing_additional:
+             logger.warning(f"Additional columns {missing_additional} missing in {file_path}. Adding default values.")
+             for col in missing_additional:
+                 if col == 'popularity':
+                     df[col] = 0.8  # Default value for popularity
+                 elif col == 'completion_rate':
+                     df[col] = 0.7  # Default value for completion_rate
+                 else:
+                     df[col] = 0.0  # Default for other additional columns
+
+         # Ensure 'level' column has valid values (not empty)
+         if 'level' in df.columns:
+             df['level'] = df['level'].apply(lambda x: 'Intermediate' if pd.isna(x) or x.strip() == "" else x)
+         else:
+             logger.warning(f"'level' column missing in {file_path}. Adding default 'Intermediate'.")
+             df['level'] = 'Intermediate'
+
+         return df
+     except ValueError as ve:
+         logger.error(f"ValueError loading {file_path}: {ve}. Using fallback data.")
+         if fallback_data is not None:
+             logger.info(f"Using fallback data for {file_path}")
+             return pd.DataFrame(fallback_data)
+         return None
+     except Exception as e:
+         logger.error(f"Error loading {file_path}: {e}. Using fallback data.")
+         if fallback_data is not None:
+             logger.info(f"Using fallback data for {file_path}")
+             return pd.DataFrame(fallback_data)
+         return None
+
+ # Load datasets with fallbacks
+ questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"], [], {
+     'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
+     'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
+                  'Intermediate Python question', 'Basic Kubernetes question'],
+     'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
+ })
+
+ courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"], ['popularity', 'completion_rate'], {
+     'skills': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
+     'course_title': ['Linux Admin', 'Git Mastery', 'Node.js Advanced', 'Python for Data', 'Kubernetes Basics'],
+     'Organization': ['Coursera', 'Udemy', 'Pluralsight', 'edX', 'Linux Foundation'],
+     'level': ['Intermediate', 'Intermediate', 'Advanced', 'Advanced', 'Intermediate'],
+     'popularity': [0.85, 0.9, 0.8, 0.95, 0.9],
+     'completion_rate': [0.65, 0.7, 0.6, 0.8, 0.75]
+ })
+
+ jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"], [], {
+     'job_title': ['DevOps Engineer', 'Cloud Architect', 'Software Engineer', 'Data Scientist', 'Security Analyst'],
+     'company_name': ['Tech Corp', 'Cloud Inc', 'Tech Solutions', 'Data Co', 'SecuriTech'],
+     'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
+     'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
+     'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
+     'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
+ })
+
+ # Validate questions_df
+ if questions_df is None or questions_df.empty:
+     logger.error("questions_df is empty or could not be loaded. Exiting.")
+     exit(1)
+ if not all(col in questions_df.columns for col in ["Skill", "Question", "Answer"]):
+     logger.error("questions_df is missing required columns. Exiting.")
+     exit(1)
+ logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {list(questions_df['Skill'].unique())}")
+
+ # Load or Initialize Models with Fallback
+ def load_universal_model():
+     default_model = "all-MiniLM-L6-v2"
+     try:
+         if os.path.exists(UNIVERSAL_MODEL_PATH):
+             logger.info(f"Loading universal model from {UNIVERSAL_MODEL_PATH}")
+             return SentenceTransformer(UNIVERSAL_MODEL_PATH)
+         else:
+             logger.info(f"Loading universal model: {default_model}")
+             model = SentenceTransformer(default_model)
+             model.save(UNIVERSAL_MODEL_PATH)
+             return model
+     except Exception as e:
+         logger.error(f"Failed to load universal model {default_model}: {e}. Exiting.")
+         exit(1)
+
+ universal_model = load_universal_model()
+
+ if os.path.exists(DETECTOR_MODEL_PATH):
+     detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
+     detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH)
+ else:
+     detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
+     detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
+
+ # Load Precomputed Resources
+ def load_precomputed_resources():
+     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
+     if all(os.path.exists(p) for p in [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH, ANSWER_EMBEDDINGS_PATH, COURSE_SIMILARITY_PATH, JOB_SIMILARITY_PATH]):
+         try:
+             with open(TFIDF_PATH, 'rb') as f: tfidf_vectorizer = pickle.load(f)
+             with open(SKILL_TFIDF_PATH, 'rb') as f: skill_tfidf = pickle.load(f)
+             with open(QUESTION_ANSWER_PATH, 'rb') as f: question_to_answer = pickle.load(f)
+             faiss_index = faiss.read_index(FAISS_INDEX_PATH)
+             with open(ANSWER_EMBEDDINGS_PATH, 'rb') as f: answer_embeddings = pickle.load(f)
+             with open(COURSE_SIMILARITY_PATH, 'rb') as f: course_similarity = pickle.load(f)
+             with open(JOB_SIMILARITY_PATH, 'rb') as f: job_similarity = pickle.load(f)
+             logger.info("Loaded precomputed resources successfully")
+         except Exception as e:
+             logger.error(f"Error loading precomputed resources: {e}")
+             precompute_resources()
+     else:
+         precompute_resources()
+
+ # Precompute Resources Offline (to be run separately)
+ def precompute_resources():
+     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
+     logger.info("Precomputing resources offline")
+     try:
+         tfidf_vectorizer = TfidfVectorizer(stop_words='english')
+         all_texts = questions_df['Answer'].tolist() + questions_df['Question'].tolist()
+         tfidf_vectorizer.fit(all_texts)
+
+         skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in questions_df['Skill'].unique()}
+         question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
          answer_embeddings = universal_model.encode(questions_df['Answer'].tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
+
          faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
          faiss_index.add(answer_embeddings)
+
+         # Precompute course similarities
+         course_skills = courses_df['skills'].fillna("").tolist()
+         course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
+         skill_embeddings = universal_model.encode(questions_df['Skill'].unique().tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
+         course_similarity = util.pytorch_cos_sim(skill_embeddings, course_embeddings).cpu().numpy()
+
+         # Precompute job similarities
+         job_skills = jobs_df['required_skills'].fillna("").tolist()
+         job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
+         job_similarity = util.pytorch_cos_sim(skill_embeddings, job_embeddings).cpu().numpy()
+
+         # Save precomputed resources
+         with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
+         with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
+         with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
          faiss.write_index(faiss_index, FAISS_INDEX_PATH)
+         with open(ANSWER_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(answer_embeddings, f)
+         with open(COURSE_SIMILARITY_PATH, 'wb') as f: pickle.dump(course_similarity, f)
+         with open(JOB_SIMILARITY_PATH, 'wb') as f: pickle.dump(job_similarity, f)
+         universal_model.save(UNIVERSAL_MODEL_PATH)
+         logger.info(f"Precomputed resources saved to {chosen_model_dir}")
      except Exception as e:
+         logger.error(f"Error during precomputation: {e}")
          raise

+ # Evaluation with precomputed data
  def evaluate_response(args):
+     try:
+         skill, user_answer, question_idx = args
+         if not user_answer:
+             return skill, 0.0, False
+
+         inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
+         with torch.no_grad():
+             logits = detector_model(**inputs).logits
+         probs = scipy.special.softmax(logits, axis=1).tolist()[0]
+         is_ai = probs[1] > 0.5
+
+         user_embedding = universal_model.encode([user_answer], batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")[0]
+         expected_embedding = torch.tensor(answer_embeddings[question_idx])
+         score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
+
+         user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
+         skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
+         relevance = np.dot(user_tfidf, skill_vec) / (np.linalg.norm(user_tfidf) * np.linalg.norm(skill_vec) + 1e-10)
+         score *= max(0.5, min(1.0, relevance))
+
+         return skill, round(max(0, score), 2), is_ai
+     except Exception as e:
+         logger.error(f"Evaluation error for {skill}: {e}")
          return skill, 0.0, False

+ # Course recommendation with precomputed similarity
  def recommend_courses(skills_to_improve, user_level, upgrade=False):
+     try:
+         if not skills_to_improve or courses_df.empty:
+             logger.info("No skills to improve or courses_df is empty.")
+             return []
+
+         skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in skills_to_improve if skill in questions_df['Skill'].unique()]
+         if not skill_indices:
+             logger.info("No matching skill indices found.")
+             return []
+
+         similarities = course_similarity[skill_indices]
+         # Use default arrays to avoid KeyError
+         popularity = courses_df['popularity'].values if 'popularity' in courses_df else np.full(len(courses_df), 0.8)
+         completion_rate = courses_df['completion_rate'].values if 'completion_rate' in courses_df else np.full(len(courses_df), 0.7)
+         total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * popularity + 0.2 * completion_rate
+
+         target_level = 'Advanced' if upgrade else user_level
+         idx = np.argsort(-total_scores)[:5]
+         candidates = courses_df.iloc[idx]
+
+         # Filter by level, but fallback to all courses if none match
+         filtered_candidates = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
+         if filtered_candidates.empty:
+             logger.warning(f"No courses found for level {target_level}. Returning top courses regardless of level.")
+             filtered_candidates = candidates
+
+         return filtered_candidates[['course_title', 'Organization']].values.tolist()[:3]
+     except Exception as e:
+         logger.error(f"Course recommendation error: {e}")
          return []

+ # Job recommendation with precomputed similarity
  def recommend_jobs(user_skills, user_level):
+     try:
+         if jobs_df.empty:
+             return []
+
+         skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in user_skills if skill in questions_df['Skill'].unique()]
+         if not skill_indices:
+             return []
+
+         similarities = job_similarity[skill_indices]
+         total_scores = 0.5 * np.max(similarities, axis=0)
+
+         if 'level' not in jobs_df.columns:
+             jobs_df['level'] = 'Intermediate'
+         level_col = jobs_df['level'].astype(str)
+         level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
+         user_level_num = level_map.get(user_level, 1)
+         level_scores = level_col.apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2)
+
+         location_pref = jobs_df.get('location', pd.Series(['Remote'] * len(jobs_df))).apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7)
+         total_job_scores = total_scores + 0.2 * level_scores + 0.1 * location_pref
+         top_job_indices = np.argsort(-total_job_scores)[:5]
+
+         return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'],
+                  jobs_df.iloc[i].get('location', 'Remote')) for i in top_job_indices]
+     except Exception as e:
+         logger.error(f"Job recommendation error: {e}")
          return []

+ # Flask application setup
  app = Flask(__name__)

+ @app.route('/')
  def health_check():
      return jsonify({"status": "active", "model_dir": chosen_model_dir})
 
          if len(answers) != len(user_skills):
              return jsonify({"error": "Answers count must match skills count"}), 400

+         load_precomputed_resources() # Load precomputed resources before processing

          user_questions = []
          for skill in user_skills:
              skill_questions = questions_df[questions_df['Skill'] == skill]
 
                  "weak_skills": weak_skills,
                  "skipped_questions": skipped_questions
              },
+             "recommended_courses": courses[:3],
              "recommended_jobs": jobs[:5]
          })
      except Exception as e:
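
A quick way to sanity-check this revision once the Space is running is to call the health-check route it defines, @app.route('/'). The sketch below is not part of the commit: the base URL and port are placeholders (Flask's development default of 5000 is assumed), and only the health route is exercised because the assessment endpoint's full request schema is not visible in these hunks.

# smoke_test.py - hypothetical helper, not in the repository
import requests

BASE_URL = "http://localhost:5000"  # assumption: adjust to the deployed host/port

resp = requests.get(f"{BASE_URL}/")
resp.raise_for_status()
payload = resp.json()

# The route above returns {"status": "active", "model_dir": <chosen_model_dir>}
assert payload.get("status") == "active"
print("Service active, model dir:", payload.get("model_dir"))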