Update app.py

app.py (CHANGED)
```diff
@@ -7,9 +7,7 @@ import numpy as np
 import pickle
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import scipy.special
-from tqdm import tqdm
 from sklearn.feature_extraction.text import TfidfVectorizer
-from multiprocessing import Pool, cpu_count
 from flask import Flask, request, jsonify
 import logging
 
```
```diff
@@ -40,67 +38,48 @@ TFIDF_PATH = os.path.join(chosen_model_dir, "tfidf_vectorizer.pkl")
 SKILL_TFIDF_PATH = os.path.join(chosen_model_dir, "skill_tfidf.pkl")
 QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
 FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
-… (old lines 43-54, the opening of the load_dataset() helper, were lost in extraction)
-            df[col] = ""
-        return df
-    except Exception as e:
-        logger.error(f"Error loading {file_path}: {e}")
-        if fallback_data is not None:
-            logger.info(f"Using fallback data for {file_path}")
-            return pd.DataFrame(fallback_data)
-        return None
-
-# Load datasets with fallbacks
-questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"], {
-    'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
-    'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
-                 'Intermediate Python question', 'Basic Kubernetes question'],
-    'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
-})
-
-courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"], {
-    'skills': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
-    'course_title': ['Linux Admin', 'Git Mastery', 'Node.js Advanced', 'Python for Data', 'Kubernetes Basics'],
-    'Organization': ['Coursera', 'Udemy', 'Pluralsight', 'edX', 'Linux Foundation'],
-    'level': ['Intermediate', 'Intermediate', 'Advanced', 'Advanced', 'Intermediate'],
-    'popularity': [0.85, 0.9, 0.8, 0.95, 0.9],
-    'completion_rate': [0.65, 0.7, 0.6, 0.8, 0.75]
-})
-
-jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"], {
-    'job_title': ['DevOps Engineer', 'Cloud Architect', 'Software Engineer', 'Data Scientist', 'Security Analyst'],
-    'company_name': ['Tech Corp', 'Cloud Inc', 'Tech Solutions', 'Data Co', 'SecuriTech'],
-    'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
-    'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
-    'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
-    'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
-})
-
-# Validate questions_df
-if questions_df is None or questions_df.empty:
-    logger.error("questions_df is empty or could not be loaded. Exiting.")
-    exit(1)
-if not all(col in questions_df.columns for col in ["Skill", "Question", "Answer"]):
-    logger.error("questions_df is missing required columns. Exiting.")
-    exit(1)
+COURSE_SIMILARITY_PATH = os.path.join(chosen_model_dir, "course_similarity.pkl")
+JOB_SIMILARITY_PATH = os.path.join(chosen_model_dir, "job_similarity.pkl")
+
+# Load datasets with fallbacks (precomputed offline)
+questions_df = pd.read_csv("Generated_Skill-Based_Questions.csv", usecols=["Skill", "Question", "Answer"])
+if questions_df.empty:
+    questions_df = pd.DataFrame({
+        'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
+        'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
+                     'Intermediate Python question', 'Basic Kubernetes question'],
+        'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
+    })
 logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {list(questions_df['Skill'].unique())}")
 
-
+courses_df = pd.read_csv("coursera_course_dataset_v2_no_null.csv", usecols=["skills", "course_title", "Organization", "level"])
+if courses_df.empty:
+    courses_df = pd.DataFrame({
+        'skills': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
+        'course_title': ['Linux Admin', 'Git Mastery', 'Node.js Advanced', 'Python for Data', 'Kubernetes Basics'],
+        'Organization': ['Coursera', 'Udemy', 'Pluralsight', 'edX', 'Linux Foundation'],
+        'level': ['Intermediate', 'Intermediate', 'Advanced', 'Advanced', 'Intermediate'],
+        'popularity': [0.85, 0.9, 0.8, 0.95, 0.9],
+        'completion_rate': [0.65, 0.7, 0.6, 0.8, 0.75]
+    })
+
+jobs_df = pd.read_csv("Updated_Job_Posting_Dataset.csv", usecols=["job_title", "company_name", "location", "required_skills", "job_description"])
+if jobs_df.empty:
+    jobs_df = pd.DataFrame({
+        'job_title': ['DevOps Engineer', 'Cloud Architect', 'Software Engineer', 'Data Scientist', 'Security Analyst'],
+        'company_name': ['Tech Corp', 'Cloud Inc', 'Tech Solutions', 'Data Co', 'SecuriTech'],
+        'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
+        'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
+        'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
+        'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
+    })
+
+# Load or Initialize Models (lighter model)
 if os.path.exists(UNIVERSAL_MODEL_PATH):
     universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH)
 else:
-    universal_model = SentenceTransformer("all-MiniLM-L6-v2")
+    universal_model = SentenceTransformer("all-MiniLM-L6-v2-distilled")  # Lighter model
+    universal_model.save(UNIVERSAL_MODEL_PATH)
 
 if os.path.exists(DETECTOR_MODEL_PATH):
     detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
```
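A note on the new loading path: `pd.read_csv` raises an exception (for example `FileNotFoundError`) when the CSV is missing, so the `if df.empty` check only covers a present-but-empty file, unlike the old `load_dataset()` helper, which caught exceptions and returned the fallback. A minimal sketch of a guarded loader that would preserve the old behavior (the helper name is illustrative, not from the commit):

```python
import logging
import pandas as pd

logger = logging.getLogger(__name__)

def load_csv_with_fallback(path, usecols, fallback: dict) -> pd.DataFrame:
    """Read a CSV, falling back to an in-memory DataFrame on any failure."""
    try:
        df = pd.read_csv(path, usecols=usecols)
        if not df.empty:
            return df
        logger.warning(f"{path} is empty; using fallback data")
    except Exception as e:  # missing file, bad columns, parse errors, ...
        logger.error(f"Error loading {path}: {e}")
    return pd.DataFrame(fallback)
```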
```diff
@@ -109,85 +88,72 @@ else:
     detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
     detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
 
-# Global variables for …
-… (old lines 113-124 lost in extraction)
-# Check if cached resources exist
-required_paths = [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH,
-                  QUESTION_EMBEDDINGS_PATH, COURSE_EMBEDDINGS_PATH, JOB_SKILL_EMBEDDINGS_PATH, JOB_TITLE_EMBEDDINGS_PATH]
-if all(os.path.exists(p) for p in required_paths):
+# Global variables for precomputed data
+tfidf_vectorizer = None
+skill_tfidf = None
+question_to_answer = None
+faiss_index = None
+course_similarity = None
+job_similarity = None
+
+# Load Precomputed Resources
+def load_precomputed_resources():
+    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, course_similarity, job_similarity
+    if all(os.path.exists(p) for p in [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH, COURSE_SIMILARITY_PATH, JOB_SIMILARITY_PATH]):
         try:
-        with open(TFIDF_PATH, 'rb') as f:
-            tfidf_vectorizer = pickle.load(f)
-        with open(SKILL_TFIDF_PATH, 'rb') as f:
-            skill_tfidf = pickle.load(f)
-        with open(QUESTION_ANSWER_PATH, 'rb') as f:
-            question_to_answer = pickle.load(f)
+            with open(TFIDF_PATH, 'rb') as f: tfidf_vectorizer = pickle.load(f)
+            with open(SKILL_TFIDF_PATH, 'rb') as f: skill_tfidf = pickle.load(f)
+            with open(QUESTION_ANSWER_PATH, 'rb') as f: question_to_answer = pickle.load(f)
             faiss_index = faiss.read_index(FAISS_INDEX_PATH)
-        with open(QUESTION_EMBEDDINGS_PATH, 'rb') as f:
-            question_embeddings = pickle.load(f)
-        with open(COURSE_EMBEDDINGS_PATH, 'rb') as f:
-            course_embeddings = pickle.load(f)
-        with open(JOB_SKILL_EMBEDDINGS_PATH, 'rb') as f:
-            job_skill_embeddings = pickle.load(f)
-        with open(JOB_TITLE_EMBEDDINGS_PATH, 'rb') as f:
-            job_title_embeddings = pickle.load(f)
-
-        if set(skill_tfidf.keys()) != set(user_skills_lower):
-            logger.info("Skill mismatch detected, recomputing resources")
-            needs_recompute = True
+            with open(COURSE_SIMILARITY_PATH, 'rb') as f: course_similarity = pickle.load(f)
+            with open(JOB_SIMILARITY_PATH, 'rb') as f: job_similarity = pickle.load(f)
+            logger.info("Loaded precomputed resources successfully")
         except Exception as e:
-            logger.error(f"Error loading …")
-… (old line 151 lost in extraction)
+            logger.error(f"Error loading precomputed resources: {e}")
+            precompute_resources()
     else:
-… (old lines 153-187, the old recompute branch, lost in extraction)
+        precompute_resources()
+
+# Precompute Resources Offline (to be run separately)
+def precompute_resources():
+    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, course_similarity, job_similarity
+    logger.info("Precomputing resources offline")
+    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
+    all_texts = questions_df['Answer'].tolist() + questions_df['Question'].tolist()
+    tfidf_vectorizer.fit(all_texts)
+
+    skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in questions_df['Skill'].unique()}
+    question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
+    answer_embeddings = universal_model.encode(questions_df['Answer'].tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
+
+    faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
+    faiss_index.add(answer_embeddings)
+
+    # Precompute course similarities
+    course_skills = courses_df['skills'].fillna("").tolist()
+    course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
+    skill_embeddings = universal_model.encode(questions_df['Skill'].unique().tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
+    course_similarity = util.pytorch_cos_sim(torch.tensor(skill_embeddings), torch.tensor(course_embeddings)).numpy()
+
+    # Precompute job similarities
+    job_skills = jobs_df['required_skills'].fillna("").tolist()
+    job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
+    job_similarity = util.pytorch_cos_sim(torch.tensor(skill_embeddings), torch.tensor(job_embeddings)).numpy()
+
+    # Save precomputed resources
+    with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
+    with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
+    with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
+    faiss.write_index(faiss_index, FAISS_INDEX_PATH)
+    with open(COURSE_SIMILARITY_PATH, 'wb') as f: pickle.dump(course_similarity, f)
+    with open(JOB_SIMILARITY_PATH, 'wb') as f: pickle.dump(job_similarity, f)
+    universal_model.save(UNIVERSAL_MODEL_PATH)
+    logger.info(f"Precomputed resources saved to {chosen_model_dir}")
+
+# Evaluation with precomputed data
 def evaluate_response(args):
     try:
-        skill, user_answer, … = args
+        skill, user_answer, question_idx = args
         if not user_answer:
             return skill, 0.0, False
 
```
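The comment says `precompute_resources` is meant to run separately, but the diff shows no entry point for it. A plausible way to invoke it offline (this `__main__` guard is an assumption, not part of the commit):

```python
# Hypothetical offline entry point inside app.py; the commit defines no such guard.
if __name__ == "__main__":
    precompute_resources()          # builds TF-IDF, FAISS index, similarity matrices
    load_precomputed_resources()    # sanity-check that the saved artifacts load back
```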
```diff
@@ -197,7 +163,9 @@ def evaluate_response(args):
         probs = scipy.special.softmax(logits, axis=1).tolist()[0]
         is_ai = probs[1] > 0.5
 
-… (old line 200 lost in extraction)
+        user_embedding = universal_model.encode([user_answer], batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")[0]
+        expected_embedding = torch.tensor(answer_embeddings[question_idx])
+        score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
 
         user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
         skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
```
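As written, these added lines carry two likely bugs: `answer_embeddings` is a local of `precompute_resources()` and is never loaded back from disk, so `evaluate_response` will hit a `NameError` whenever the resources come from the pickle/FAISS cache; and `user_embedding` may sit on the GPU while `expected_embedding` is built on the CPU. A hedged sketch of one repair, recovering the stored vector from the FAISS index that is loaded (flat indexes support `reconstruct`; this assumes `questions_df` keeps its default RangeIndex, so the label equals the row position used when the index was built):

```python
# Sketch only: rebuild the expected-answer vector from the loaded FAISS index
# instead of the precompute-time local `answer_embeddings`, and keep both
# tensors on the CPU so the cosine similarity cannot hit a device mismatch.
user_embedding = universal_model.encode([user_answer], convert_to_tensor=True).cpu()[0]
expected_vec = faiss_index.reconstruct(int(question_idx))   # float32 numpy vector
expected_embedding = torch.from_numpy(expected_vec)
score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
```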
```diff
@@ -209,43 +177,40 @@ def evaluate_response(args):
         logger.error(f"Evaluation error for {skill}: {e}")
         return skill, 0.0, False
 
-# …
+# Course recommendation with precomputed similarity
 def recommend_courses(skills_to_improve, user_level, upgrade=False):
     try:
         if not skills_to_improve or courses_df.empty:
             return []
 
-… (old lines 218-220 lost in extraction)
-        courses_df['completion_rate'] = 0.7
-… (old line 222 lost in extraction)
-        skill_embeddings = universal_model.encode(skills_to_improve, batch_size=128, convert_to_tensor=True)
-        similarities = util.pytorch_cos_sim(skill_embeddings, torch.tensor(course_embeddings)).numpy()
+        skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in skills_to_improve if skill in questions_df['Skill'].unique()]
+        if not skill_indices:
+            return []
 
-… (old line 226 lost in extraction)
+        similarities = course_similarity[skill_indices]
+        total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * courses_df.get('popularity', pd.Series([0.8] * len(courses_df))).values + 0.2 * courses_df.get('completion_rate', pd.Series([0.7] * len(courses_df))).values
 
-        recommendations = []
         target_level = 'Advanced' if upgrade else user_level
-… (old lines 230-233 lost in extraction)
-        recommendations.extend(candidates[['course_title', 'Organization']].values.tolist()[:3])
-… (old line 235 lost in extraction)
-        return list(dict.fromkeys(map(tuple, recommendations)))
+        idx = np.argsort(-total_scores)[:5]
+        candidates = courses_df.iloc[idx]
+        candidates = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
+        return candidates[['course_title', 'Organization']].values.tolist()[:3]
     except Exception as e:
         logger.error(f"Course recommendation error: {e}")
         return []
 
-# …
+# Job recommendation with precomputed similarity
 def recommend_jobs(user_skills, user_level):
     try:
         if jobs_df.empty:
             return []
 
-… (old lines 247-248 lost in extraction)
+        skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in user_skills if skill in questions_df['Skill'].unique()]
+        if not skill_indices:
+            return []
+
+        similarities = job_similarity[skill_indices]
+        total_scores = 0.5 * np.max(similarities, axis=0)
 
         if 'level' not in jobs_df.columns:
             jobs_df['level'] = 'Intermediate'
```
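Both recommenders recover rows of the precomputed matrices via `list(questions_df['Skill'].unique()).index(skill)`, so correctness silently depends on `unique()` returning the same ordering at lookup time as at precompute time. Pandas does preserve first-seen order, but only while `questions_df` itself is unchanged. A small sketch of making that contract explicit (the `skill_order` and `rows_for` names are mine, not the commit's):

```python
# Freeze the skill ordering once, next to the precomputed matrices.
skill_order = list(questions_df['Skill'].unique())
skill_to_row = {skill: i for i, skill in enumerate(skill_order)}

def rows_for(skills):
    """Map skill names to row indices of course_similarity / job_similarity."""
    return [skill_to_row[s] for s in skills if s in skill_to_row]
```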
```diff
@@ -255,9 +220,7 @@ def recommend_jobs(user_skills, user_level):
         level_scores = level_col.apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2)
 
         location_pref = jobs_df.get('location', pd.Series(['Remote'] * len(jobs_df))).apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7)
-… (old lines 258-259 lost in extraction)
-        total_job_scores = 0.5 * skill_similarities + 0.2 * level_scores + 0.1 * location_pref + 0.2 * industry_similarities
+        total_job_scores = total_scores + 0.2 * level_scores + 0.1 * location_pref
         top_job_indices = np.argsort(-total_job_scores)[:5]
 
         return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'],
```
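Worth noting: the old blend used four terms with weights 0.5 + 0.2 + 0.1 + 0.2 = 1.0; the new one drops the industry term, so its weights (0.5 for skills inside `total_scores`, plus 0.2 and 0.1) sum to 0.8. The `argsort` ranking is unaffected by a constant rescale, but if the absolute score is ever surfaced, renormalizing is cheap (a sketch, not in the commit):

```python
# Optional: renormalize so the blended job score stays in [0, 1].
total_job_scores = (total_scores + 0.2 * level_scores + 0.1 * location_pref) / 0.8
```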
```diff
@@ -287,16 +250,11 @@ def assess_skills():
     if len(answers) != len(user_skills):
         return jsonify({"error": "Answers count must match skills count"}), 400
 
-… (old lines 290-291 lost in extraction)
-    # Get relevant questions
-    filtered_questions = questions_df[questions_df['Skill'].str.lower().isin([skill.lower() for skill in user_skills])]
-    if filtered_questions.empty:
-        return jsonify({"error": "No matching questions found for the user's skills."}), 500
+    load_precomputed_resources()
 
     user_questions = []
     for skill in user_skills:
-        skill_questions = …
+        skill_questions = questions_df[questions_df['Skill'] == skill]
         if not skill_questions.empty:
             user_questions.append(skill_questions.sample(1).iloc[0])
         else:
```
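`load_precomputed_resources()` now runs inside the request handler, so every call to the endpoint re-reads the pickles and the FAISS index from disk (and falls back to a full recompute on any failure). A common pattern is to load once per process behind a guard flag; a sketch (the `_resources_ready` flag and helper are my naming, not the commit's):

```python
_resources_ready = False

def ensure_resources():
    """Load (or build) the precomputed artifacts exactly once per process."""
    global _resources_ready
    if not _resources_ready:
        load_precomputed_resources()
        _resources_ready = True
# ...then call ensure_resources() at the top of assess_skills().
```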
```diff
@@ -307,41 +265,21 @@ def assess_skills():
             })
     user_questions = pd.DataFrame(user_questions).reset_index(drop=True)
 
-    if len(user_questions) != len(user_skills):
-        return jsonify({"error": f"Internal error: Number of selected questions ({len(user_questions)}) does not match number of skills ({len(user_skills)})."}), 500
-
-    # Batch encode all user answers and expected answers
-    user_answers = []
-    expected_answers = []
     user_responses = []
     for idx, row in user_questions.iterrows():
         answer = answers[idx]
         if not answer or answer.lower() == 'skip':
-            user_responses.append((row['Skill'], None, None, …)
+            user_responses.append((row['Skill'], None, row['Question']))
         else:
-… (old lines 322-326 lost in extraction)
-    if user_answers:
-        all_embeddings = universal_model.encode(user_answers + expected_answers, batch_size=128, convert_to_tensor=True)
-        user_answer_embeddings = all_embeddings[:len(user_answers)]
-        expected_answer_embeddings = all_embeddings[len(user_answers):]
-
-        for idx, (skill, answer, expected, _, _) in enumerate(user_responses):
-            if answer:
-                user_responses[idx] = (skill, answer, expected, user_answer_embeddings[idx], expected_answer_embeddings[idx])
-
-    # Parallelize evaluation
-    with Pool(processes=min(cpu_count(), 4)) as pool:
-        eval_args = [response for response in user_responses if response[1]]
-        results = pool.map(evaluate_response, eval_args)
+            question_idx = questions_df.index[questions_df['Question'] == row['Question']][0]
+            user_responses.append((row['Skill'], answer, question_idx))
+
+    results = [evaluate_response(response) for response in user_responses]
 
     user_scores = {}
     ai_flags = {}
     scores_list = []
-    skipped_questions = [f"{skill} ({question})" for skill, user_code, …]
+    skipped_questions = [f"{skill} ({question})" for skill, user_code, question in user_responses if not user_code]
     for skill, score, is_ai in results:
         if skill in user_scores:
             user_scores[skill] = max(user_scores[skill], score)
```
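Two behavioral changes ride along with this hunk: evaluation is now sequential (the multiprocessing Pool and batch encoding are gone), and skipped answers are no longer filtered out before scoring, so each skipped skill comes back from `evaluate_response` as a hard 0.0 and is folded into `user_scores`. If the old semantics (skipped = unscored) are wanted, the filter is one line (a sketch, not in the commit):

```python
# Score only the responses that actually carry an answer, as the old code did.
results = [evaluate_response(r) for r in user_responses if r[1]]
```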
```diff
@@ -355,7 +293,6 @@ def assess_skills():
     dynamic_threshold = max(40, mean_score)
     weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
 
-    # Generate recommendations
     courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
     jobs = recommend_jobs(user_skills, user_level)
 
```
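For reference, a request that satisfies the validation shown above (equal-length skills and answers, with "skip" allowed). The route path and the JSON field names are assumptions inferred from the handler, since the diff never shows the `@app.route` decorator or the request parsing:

```python
import requests

# Hypothetical invocation of the assess_skills endpoint.
payload = {
    "skills": ["Python", "Git"],
    "answers": ["Generators produce items lazily via yield.", "skip"],
    "user_level": "Intermediate",
}
resp = requests.post("http://localhost:5000/assess", json=payload, timeout=60)
print(resp.status_code, resp.json())
```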