import os
import sys
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy.special
from tqdm import tqdm
from tabulate import tabulate
from sklearn.feature_extraction.text import TfidfVectorizer
from multiprocessing import Pool, cpu_count
from flask import Flask, request, jsonify
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Disable tokenizers parallelism to avoid fork-related deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Paths for saving artifacts
MODEL_DIR = "./saved_models"  # Primary location in /app/saved_models
FALLBACK_MODEL_DIR = "/tmp/saved_models"  # Fallback if ./saved_models fails

# Try to use the primary directory, fall back to /tmp if needed
try:
    os.makedirs(MODEL_DIR, exist_ok=True)
    logger.info(f"Successfully created/accessed directory: {MODEL_DIR}")
    chosen_model_dir = MODEL_DIR
except PermissionError as e:
    logger.warning(f"Permission denied creating directory {MODEL_DIR}: {e}. Falling back to {FALLBACK_MODEL_DIR}")
    os.makedirs(FALLBACK_MODEL_DIR, exist_ok=True)
    chosen_model_dir = FALLBACK_MODEL_DIR
except Exception as e:
    logger.error(f"Unexpected error creating directory {MODEL_DIR}: {e}")
    raise

# Update paths based on the chosen directory
UNIVERSAL_MODEL_PATH = os.path.join(chosen_model_dir, "universal_model")
DETECTOR_MODEL_PATH = os.path.join(chosen_model_dir, "detector_model")
TFIDF_PATH = os.path.join(chosen_model_dir, "tfidf_vectorizer.pkl")
SKILL_TFIDF_PATH = os.path.join(chosen_model_dir, "skill_tfidf.pkl")
QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
# Load Datasets
def load_dataset(file_path, required_columns=None):
    required_columns = required_columns or []
    try:
        df = pd.read_csv(file_path)
        for col in required_columns:
            if col not in df.columns:
                logger.warning(f"Column '{col}' missing in {file_path}. Using default values.")
                df[col] = "" if col != 'level' else 'Intermediate'
        return df
    except FileNotFoundError:
        logger.error(f"Dataset not found at {file_path}. Returning None.")
        return None
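
# The four CSVs below are expected alongside app.py; load_dataset fills any missing
# required column with a default ("" or 'Intermediate' for 'level') and returns None
# when a file is absent.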
user_df = load_dataset("Updated_User_Profile_Dataset.csv", ["name", "skills", "level"])
questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"])
courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"])
jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"])
# Simulate courses_df with relevant skills if the real dataset is missing or empty
if courses_df is None or 'skills' not in courses_df.columns or courses_df['skills'].str.strip().eq('').all():
    courses_df = pd.DataFrame({
        'skills': ['Docker', 'Jenkins', 'Azure', 'Cybersecurity'],
        'course_title': ['Docker Mastery', 'Jenkins CI/CD', 'Azure Fundamentals', 'Cybersecurity Basics'],
        'Organization': ['Udemy', 'Coursera', 'Microsoft', 'edX'],
        'level': ['Intermediate', 'Intermediate', 'Intermediate', 'Advanced'],
        'popularity': [0.9, 0.85, 0.95, 0.8],
        'completion_rate': [0.7, 0.65, 0.8, 0.6]
    })
# Validate questions_df
if questions_df is None or questions_df.empty:
    logger.error("questions_df is empty or could not be loaded. Exiting.")
    sys.exit(1)
if not all(col in questions_df.columns for col in ["Skill", "Question", "Answer"]):
    logger.error("questions_df is missing required columns. Exiting.")
    sys.exit(1)
logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {questions_df['Skill'].unique().tolist()}")
# Load or Initialize Models
if os.path.exists(UNIVERSAL_MODEL_PATH):
    universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH)
else:
    universal_model = SentenceTransformer("all-MiniLM-L6-v2")

if os.path.exists(DETECTOR_MODEL_PATH):
    detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
    detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH)
else:
    detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
    detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
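
# universal_model (all-MiniLM-L6-v2 unless a locally saved copy exists) produces the sentence
# embeddings used for answer scoring and course/job matching; detector_model is the RoBERTa-based
# OpenAI output detector used to flag answers that look AI-generated.
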
# Precompute Resources with Validation
def resources_valid(saved_skills, current_skills):
    return set(saved_skills) == set(current_skills)

def initialize_resources(user_skills):
    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings
    if (os.path.exists(TFIDF_PATH) and os.path.exists(SKILL_TFIDF_PATH) and
            os.path.exists(QUESTION_ANSWER_PATH) and os.path.exists(FAISS_INDEX_PATH)):
        with open(TFIDF_PATH, 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
        with open(SKILL_TFIDF_PATH, 'rb') as f:
            skill_tfidf = pickle.load(f)
        with open(QUESTION_ANSWER_PATH, 'rb') as f:
            question_to_answer = pickle.load(f)
        faiss_index = faiss.read_index(FAISS_INDEX_PATH)
        answer_embeddings = universal_model.encode(list(question_to_answer.values()), convert_to_tensor=True, show_progress_bar=False).cpu().numpy()
        if not resources_valid(skill_tfidf.keys(), [s.lower() for s in user_skills]):
            # Saved artifacts were built for a different skill set: recompute in memory
            logger.info("⚠ Saved skill TF-IDF mismatch detected. Recomputing resources.")
            tfidf_vectorizer = TfidfVectorizer(stop_words='english')
            all_texts = user_skills + questions_df['Answer'].fillna("").tolist() + questions_df['Question'].tolist()
            tfidf_vectorizer.fit(all_texts)
            skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill.lower()]).toarray()[0] for skill in user_skills}
            question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
            answer_embeddings = universal_model.encode(list(question_to_answer.values()), convert_to_tensor=True, show_progress_bar=False).cpu().numpy()
            faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
            faiss_index.add(answer_embeddings)
    else:
        # First run (or artifacts missing): build everything and persist it for reuse
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        all_texts = user_skills + questions_df['Answer'].fillna("").tolist() + questions_df['Question'].tolist()
        tfidf_vectorizer.fit(all_texts)
        skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill.lower()]).toarray()[0] for skill in user_skills}
        question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
        answer_embeddings = universal_model.encode(list(question_to_answer.values()), convert_to_tensor=True, show_progress_bar=False).cpu().numpy()
        faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
        faiss_index.add(answer_embeddings)
        with open(TFIDF_PATH, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)
        with open(SKILL_TFIDF_PATH, 'wb') as f:
            pickle.dump(skill_tfidf, f)
        with open(QUESTION_ANSWER_PATH, 'wb') as f:
            pickle.dump(question_to_answer, f)
        faiss.write_index(faiss_index, FAISS_INDEX_PATH)
        universal_model.save_pretrained(UNIVERSAL_MODEL_PATH)
        detector_model.save_pretrained(DETECTOR_MODEL_PATH)
        detector_tokenizer.save_pretrained(DETECTOR_MODEL_PATH)
        logger.info(f"Models and resources saved to {chosen_model_dir}")
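
# Artifacts persisted on first run: tfidf_vectorizer.pkl, skill_tfidf.pkl, question_to_answer.pkl,
# faiss_index.index, plus the universal_model/ and detector_model/ directories under chosen_model_dir.
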
# Evaluate Responses
def evaluate_response(args):
    skill, user_answer, question = args
    if not user_answer:
        return skill, 0, False
    inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = detector_model(**inputs).logits
    probs = scipy.special.softmax(logits, axis=1).tolist()[0]
    is_ai_generated = probs[1] > 0.5
    user_embedding = universal_model.encode(user_answer, convert_to_tensor=True)
    expected_answer = question_to_answer.get(question, "")
    expected_embedding = universal_model.encode(expected_answer, convert_to_tensor=True)
    score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
    user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
    skill_lower = skill.lower()
    skill_vec = skill_tfidf.get(skill_lower, tfidf_vectorizer.transform([skill_lower]).toarray()[0])
    skill_relevance = np.dot(user_tfidf, skill_vec) / (np.linalg.norm(user_tfidf) * np.linalg.norm(skill_vec) + 1e-10)
    penalty = min(1.0, max(0.5, skill_relevance))
    score *= penalty
    logger.debug(f"Evaluated {skill}: score={score:.2f}, is_ai={is_ai_generated}")
    return skill, round(max(0, score), 2), is_ai_generated
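
# Scoring sketch: each answer gets cosine_similarity(user, reference) * 100, then is scaled by a
# TF-IDF skill-relevance penalty clamped to [0.5, 1.0]. Hypothetical direct call for debugging
# (bypassing Flask and the multiprocessing pool):
#   skill, score, is_ai = evaluate_response(("Python", "Pin dependencies and use virtual environments.", question_text))
# where question_text should be a key of question_to_answer.
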
# Recommend Courses
def recommend_courses(skills_to_improve, user_level, upgrade=False):
    if not skills_to_improve:
        return []
    level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
    skill_embeddings = universal_model.encode(skills_to_improve, convert_to_tensor=True)
    course_embeddings = universal_model.encode(courses_df['skills'].fillna("").tolist(), convert_to_tensor=True)
    bert_similarities = util.pytorch_cos_sim(skill_embeddings, course_embeddings).numpy()
    collab_scores = []
    for skill in skills_to_improve:
        overlap = sum(1 for user_skills_str in user_df['skills'] if pd.notna(user_skills_str) and skill.lower() in user_skills_str.lower())
        collab_scores.append(overlap / len(user_df))
    collab_similarities = np.array([collab_scores]).repeat(len(courses_df), axis=0).T
    popularity = courses_df['popularity'].fillna(0.5).to_numpy() if 'popularity' in courses_df.columns else np.full(len(courses_df), 0.5)
    completion = courses_df['completion_rate'].fillna(0.5).to_numpy() if 'completion_rate' in courses_df.columns else np.full(len(courses_df), 0.5)
    total_scores = (0.6 * bert_similarities + 0.2 * collab_similarities + 0.1 * popularity + 0.1 * completion)
    recommended_courses = []
    target_level = 'Advanced' if upgrade else user_level
    for i, skill in enumerate(skills_to_improve):
        top_indices = total_scores[i].argsort()[-5:][::-1]
        candidates = courses_df.iloc[top_indices].copy()
        exact_matches = candidates[candidates['skills'].str.lower() == skill.lower()]
        if not exact_matches.empty:
            candidates = exact_matches
        candidates.loc[:, "level_match"] = candidates['level'].apply(
            lambda x: 1 if x == target_level
            else 0.8 if abs(level_map.get(x, 1) - level_map.get(user_level, 1)) <= 1
            else 0.5
        )
        level_filtered = candidates.sort_values(by="level_match", ascending=False)
        recommended_courses.extend(level_filtered[['course_title', 'Organization']].values.tolist()[:3])
    return list(dict.fromkeys(tuple(course) for course in recommended_courses if course[0].strip()))
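
# Course ranking blends 0.6 * embedding similarity, 0.2 * collaborative overlap, 0.1 * popularity
# and 0.1 * completion rate, then prefers courses whose level matches the target level.
# Hypothetical usage: recommend_courses(["Docker"], "Intermediate") -> up to three
# (course_title, Organization) pairs per skill, de-duplicated.
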
# Recommend Jobs
def recommend_jobs(user_skills, user_level):
    job_field = 'required_skills' if 'required_skills' in jobs_df.columns and not jobs_df['required_skills'].str.strip().eq('').all() else 'job_description'
    job_embeddings = universal_model.encode(jobs_df[job_field].fillna("").tolist(), convert_to_tensor=True)
    user_embedding = universal_model.encode(" ".join(user_skills), convert_to_tensor=True)
    skill_similarities = util.pytorch_cos_sim(user_embedding, job_embeddings).numpy()[0]
    level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
    user_level_num = level_map.get(user_level, 1)
    if 'level' in jobs_df.columns:
        exp_match = jobs_df['level'].fillna('Intermediate').apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num) / 2).to_numpy()
    else:
        exp_match = np.ones(len(jobs_df)) * 0.5
    location_pref = jobs_df['location'].apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7).to_numpy()
    industry_embeddings = universal_model.encode(jobs_df['job_title'].fillna("").tolist(), convert_to_tensor=True)
    industry_similarities = util.pytorch_cos_sim(user_embedding, industry_embeddings).numpy()[0]
    total_job_scores = (0.5 * skill_similarities + 0.2 * exp_match + 0.1 * location_pref + 0.2 * industry_similarities)
    top_job_indices = total_job_scores.argsort()[-5:][::-1]
    return [(jobs_df.iloc[idx]['job_title'], jobs_df.iloc[idx]['company_name'], jobs_df.iloc[idx]['location']) for idx in top_job_indices]
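
# Job ranking blends 0.5 * skill similarity, 0.2 * experience-level match, 0.1 * location preference
# (Islamabad/Karachi favored) and 0.2 * job-title similarity.
# Hypothetical usage: recommend_jobs(["Python", "SQL"], "Intermediate") -> top five
# (job_title, company_name, location) tuples.
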
# Main API Endpoint
app = Flask(__name__)

@app.route("/assess", methods=["POST"])  # route path/method assumed; adjust to match your deployment
def assess_skills():
    data = request.get_json()
    logger.info(f"Received request: {data}")
    if not data or 'user_index' not in data or 'answers' not in data:
        logger.error("Invalid input: Missing 'user_index' or 'answers' in JSON body.")
        return jsonify({"error": "Invalid input. Provide 'user_index' and 'answers' in JSON body."}), 400

    # Validate answers length immediately
    answers = data['answers']
    if not isinstance(answers, list):
        logger.error(f"Answers must be a list, got: {type(answers)}")
        return jsonify({"error": "Answers must be a list."}), 400
    if len(answers) != 4:
        logger.error(f"Expected exactly 4 answers, but received {len(answers)}.")
        return jsonify({"error": f"Please provide exactly 4 answers. Received {len(answers)}."}), 400

    try:
        user_index = int(data['user_index'])
    except (TypeError, ValueError):
        logger.error(f"Invalid user index: {data['user_index']!r}. Must be an integer.")
        return jsonify({"error": "Invalid user index."}), 400
    if user_index < 0 or user_index >= len(user_df):
        logger.error(f"Invalid user index: {user_index}. Must be between 0 and {len(user_df) - 1}.")
        return jsonify({"error": "Invalid user index."}), 400

    user_text = user_df.loc[user_index, 'skills']
    user_skills = [skill.strip() for skill in user_text.split(",") if skill.strip()] if isinstance(user_text, str) else ["Python", "SQL"]
    user_name = user_df.loc[user_index, 'name']
    user_level = user_df.loc[user_index, 'level'] if 'level' in user_df.columns and pd.notna(user_df.loc[user_index, 'level']) else 'Intermediate'
    logger.info(f"User: {user_name}, Skills: {user_skills}, Level: {user_level}")
    initialize_resources(user_skills)

    # Normalize skills for case-insensitive matching
    filtered_questions = questions_df[questions_df['Skill'].str.lower().isin([skill.lower() for skill in user_skills])]
    logger.info(f"Filtered questions shape: {filtered_questions.shape}")
    logger.info(f"Available skills in questions_df: {filtered_questions['Skill'].unique().tolist()}")
    if filtered_questions.empty:
        logger.error("No matching questions found for the user's skills.")
        return jsonify({"error": "No matching questions found!"}), 500

    user_questions = []
    for skill in user_skills:
        skill_questions = filtered_questions[filtered_questions['Skill'].str.lower() == skill.lower()]
        logger.info(f"Questions for skill '{skill}': {len(skill_questions)}")
        if not skill_questions.empty:
            user_questions.append(skill_questions.sample(1).iloc[0])
        else:
            logger.warning(f"No questions found for skill '{skill}'. Using a default question.")
            user_questions.append({
                'Skill': skill,
                'Question': f"What are the best practices for using {skill} in a production environment?",
                'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
            })
    user_questions = pd.DataFrame(user_questions).reset_index(drop=True)  # Reset index to ensure sequential indices
    logger.info(f"Selected questions: {user_questions[['Skill', 'Question']].to_dict(orient='records')}")
    logger.info(f"Number of selected questions: {len(user_questions)}")
    if len(user_questions) != 4:
        logger.error(f"Not enough questions for all skills. Expected 4, got {len(user_questions)}.")
        return jsonify({"error": f"Not enough questions for all skills! Expected 4, got {len(user_questions)}."}), 500
    user_responses = []
    for idx, row in user_questions.iterrows():
        logger.debug(f"Pairing question for skill '{row['Skill']}' with answer at index {idx}")
        if idx >= len(answers):
            logger.error(f"Index out of range: idx={idx}, len(answers)={len(answers)}")
            return jsonify({"error": f"Internal error: Index {idx} out of range for answers list of length {len(answers)}."}), 500
        answer = answers[idx]
        if not answer or answer.lower() == 'skip':
            user_responses.append((row['Skill'], None, row['Question']))
        else:
            user_responses.append((row['Skill'], answer, row['Question']))

    try:
        with Pool(cpu_count()) as pool:
            eval_args = [(skill, user_code, question) for skill, user_code, question in user_responses if user_code]
            logger.info(f"Evaluating {len(eval_args)} answers using multiprocessing pool.")
            results = pool.map(evaluate_response, eval_args)
            logger.info(f"Evaluation results: {results}")
    except Exception as e:
        logger.error(f"Error in evaluate_response: {str(e)}", exc_info=True)
        return jsonify({"error": "Failed to evaluate answers due to an internal error."}), 500
    user_scores = {}
    ai_flags = {}
    scores_list = []
    skipped_questions = [f"{skill} ({question})" for skill, user_code, question in user_responses if user_code is None]
    for skill, score, is_ai in results:
        if skill in user_scores:
            user_scores[skill] = max(user_scores[skill], score)
            ai_flags[skill] = ai_flags[skill] or is_ai
        else:
            user_scores[skill] = score
            ai_flags[skill] = is_ai
        scores_list.append(score)
    mean_score = np.mean(scores_list) if scores_list else 50
    dynamic_threshold = max(40, mean_score)
    weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]

    assessment_results = [
        (skill, f"{'■' * int(score // 10)}{'-' * (10 - int(score // 10))}", f"{score:.2f}%", "AI-Generated" if ai_flags[skill] else "Human-Written")
        for skill, score in user_scores.items()
    ]
    assessment_output = tabulate(assessment_results, headers=["Skill", "Progress", "Score", "Origin"], tablefmt="grid")
    if skipped_questions:
        assessment_output += f"\nSkipped Questions: {skipped_questions}"
    assessment_output += f"\nMean Score: {mean_score:.2f}, Dynamic Threshold: {dynamic_threshold:.2f}"
    assessment_output += f"\nWeak Skills: {weak_skills if weak_skills else 'None'}"

    skills_to_recommend = weak_skills if weak_skills else user_skills
    upgrade_flag = not weak_skills
    recommended_courses = recommend_courses(skills_to_recommend, user_level, upgrade=upgrade_flag)
    courses_output = tabulate(recommended_courses, headers=["Course", "Organization"], tablefmt="grid") if recommended_courses else "None"
    recommended_jobs = recommend_jobs(user_skills, user_level)
    jobs_output = tabulate(recommended_jobs, headers=["Job Title", "Company", "Location"], tablefmt="grid")

    response = {
        "user_info": f"User: {user_name}\nSkills: {user_skills}\nLevel: {user_level}",
        "assessment_results": assessment_output,
        "recommended_courses": courses_output,
        "recommended_jobs": jobs_output
    }
    logger.info(f"Response: {response}")
    return jsonify(response)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
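
# Example request (illustrative; assumes the "/assess" route added above and the default port 7860):
#   curl -X POST http://localhost:7860/assess \
#        -H "Content-Type: application/json" \
#        -d '{"user_index": 0, "answers": ["answer 1", "answer 2", "skip", "answer 4"]}'
# The JSON response contains "user_info", "assessment_results", "recommended_courses", and "recommended_jobs".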