File size: 6,602 Bytes
edecf53
 
 
 
 
 
 
 
 
 
ed32658
edecf53
 
 
 
 
ed32658
 
edecf53
 
 
 
 
89f240b
 
 
 
 
 
 
edecf53
 
 
 
 
 
 
 
aa352fb
66f1fae
aa352fb
 
 
 
 
 
 
edecf53
 
89f240b
edecf53
89f240b
 
 
 
edecf53
89f240b
edecf53
89f240b
edecf53
 
aa352fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edecf53
 
 
 
aa352fb
66f1fae
 
 
 
 
ed32658
 
66f1fae
89f240b
66f1fae
89f240b
ed32658
89f240b
 
 
 
 
 
 
ed32658
 
aa352fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edecf53
 
66f1fae
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict, List
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
import os

# Set cache directory
# NOTE(review): these assignments run after `sentence_transformers` was
# imported at the top of the file. If the Hugging Face libraries resolve their
# cache location at import time, setting the variables here is too late —
# confirm, and move this above the imports if so.
os.environ["HF_HOME"] = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/cache"

app = FastAPI()

# Load datasets
# All three CSV files are read once, at import time, from DATA_DIR. The job and
# question files are decoded as latin1; the course file uses the pandas default.
DATA_DIR = "/app/data/"
job_df = pd.read_csv(os.path.join(DATA_DIR, "Updated_Job_Posting_Dataset.csv"), encoding="latin1")
course_df = pd.read_csv(os.path.join(DATA_DIR, "coursera_course_dataset_v2_no_null.csv"))
coding_df = pd.read_csv(os.path.join(DATA_DIR, "Software Questions.csv"), encoding="latin1")

# Preprocess datasets
# Rename the question dataset's columns to the lowercase names the rest of
# this module uses.
coding_df = coding_df.rename(columns={
    'Question': 'question',
    'Answer': 'solutions',
    'Category': 'category',
    'Difficulty': 'difficulty'
})
# Discard question rows that lack any of the four fields used below.
coding_df.dropna(subset=['question', 'solutions', 'category', 'difficulty'], inplace=True)
job_df.rename(columns={'company_name': 'company', 'required_skills': 'skills'}, inplace=True)
course_df.rename(columns={'Title': 'course_title', 'Skills': 'skills'}, inplace=True)
# Missing job descriptions become "" so every row can be encoded as text.
job_df["job_description"] = job_df["job_description"].fillna("")

# Load BERT model and vectorizer
# The vectorizer is created unfitted; evaluate_coding_with_time fits it on the
# pair of texts it compares each time it is called.
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
vectorizer = TfidfVectorizer()

# Pydantic models for request bodies
class ChallengeRequest(BaseModel):
    """Request body for the POST /challenges endpoint."""

    # Categories to fetch questions for; matched case-insensitively as a
    # substring of the question dataset's category column.
    skills: List[str]
    # Optional difficulty filter, compared case-insensitively; None (or an
    # empty string) disables the filter.
    difficulty: Optional[str] = None

class AssessmentRequest(BaseModel):
    """Request body for the POST /assess endpoint."""

    # Echoed back unchanged in the assessment response.
    name: str
    # Skills to assess; also used as the query text for job recommendations.
    skills: List[str]
    # Submitted answers as {skill: {question text: submitted code}}. A skill
    # with no entry here receives the default score instead of being graded.
    answers: Optional[Dict[str, Dict[str, str]]] = None

# Get coding challenges
def get_coding_challenges(categories: List[str], num_questions=5, difficulty: Optional[str] = None):
    """Pick random coding questions for each requested category.

    Args:
        categories: Category names to look up. Each one is matched literally
            and case-insensitively as a substring of the dataset's
            ``category`` column.
        num_questions: Maximum number of questions to return per category.
        difficulty: Optional difficulty label, compared case-insensitively.
            A falsy value disables the filter.

    Returns:
        A dict mapping every requested category (as given) to a list of
        records with the keys ``question``, ``solutions`` and ``difficulty``.
        Categories with no matching question map to an empty list.
    """
    skill_challenges = {}
    for category in categories:
        # regex=False: categories come straight from the request body, and
        # names such as "C++" are invalid regular expressions while "C#" or
        # ".NET" would silently match the wrong rows if treated as patterns.
        in_category = coding_df["category"].str.contains(
            category, case=False, na=False, regex=False
        )
        relevant = coding_df[in_category]
        if difficulty:
            relevant = relevant[relevant["difficulty"].str.lower() == difficulty.lower()]
        if relevant.empty:
            skill_challenges[category] = []
            continue
        # Never ask sample() for more rows than exist; it would raise.
        picked = relevant.sample(min(num_questions, len(relevant)))
        skill_challenges[category] = picked[["question", "solutions", "difficulty"]].to_dict(orient="records")
    return skill_challenges

# Evaluate coding answers
def evaluate_coding_with_time(user_code, correct_code, start_time):
    """Score a submitted answer against the reference solution.

    The score is the TF-IDF cosine similarity of the two texts scaled to
    0-100. For every second elapsed beyond 120 s since ``start_time`` the
    score is reduced by 0.1 point. The result is floored at 0 and rounded to
    two decimals.

    Args:
        user_code: The submitted answer text.
        correct_code: The reference solution text.
        start_time: Epoch timestamp (as from ``time.time()``) at which the
            attempt is considered to have started.

    Returns:
        The score as a plain ``float`` in the range 0-100.
    """
    # A blank text shares no terms with anything, so the similarity is zero.
    # Short-circuiting also avoids fitting the vectorizer on empty input.
    if not user_code.strip() or not correct_code.strip():
        return 0.0

    execution_time = time.time() - start_time
    try:
        vectorized = vectorizer.fit_transform([user_code, correct_code])
    except ValueError:
        # scikit-learn raises ValueError (empty vocabulary) when neither text
        # yields a token, e.g. both consist of single characters only. There
        # is nothing to compare, so the answer earns no credit.
        return 0.0
    similarity = cosine_similarity(vectorized)[0][1] * 100
    if execution_time > 120:
        similarity -= (execution_time - 120) * 0.1
    return float(round(max(similarity, 0), 2))

# Assign proficiency level
def get_proficiency_level(score):
    """Map a numeric score to a proficiency label.

    Scores of 80 or more are "Expert", scores of at least 50 but below 80
    are "Intermediate", and anything lower is "Beginner".
    """
    # Ordered from the highest threshold down; the first one reached wins.
    tiers = ((80, "Expert"), (50, "Intermediate"))
    for threshold, label in tiers:
        if score >= threshold:
            return label
    return "Beginner"

# Recommend courses
def recommend_courses(weak_skills):
    """Suggest up to five courses that cover any of the given skills.

    Args:
        weak_skills: Skill names to look for. Each one is matched literally
            and case-insensitively as a substring of the course dataset's
            ``skills`` column.

    Returns:
        A list of at most five records with the keys ``course_title`` and
        ``Organization``, in dataset order. Empty when ``weak_skills`` is
        empty or nothing matches.
    """
    if not weak_skills:
        return []
    skill_text = course_df['skills']
    # Combine one literal match per skill rather than joining the skills into
    # a single regular expression: user-supplied names such as "C++" are not
    # valid patterns and would raise instead of matching.
    matches = pd.Series(False, index=course_df.index)
    for skill in weak_skills:
        matches |= skill_text.str.contains(skill, case=False, na=False, regex=False)
    courses = course_df[matches]
    return courses[['course_title', 'Organization']].head(5).to_dict(orient="records")

# Recommend jobs
# Embedding matrix for job_df["job_description"], filled on first use.
_job_embeddings = None


def _get_job_embeddings():
    """Encode every job description once and reuse the result afterwards."""
    global _job_embeddings
    if _job_embeddings is None or len(_job_embeddings) != len(job_df):
        descriptions = job_df["job_description"].astype(str).tolist()
        # One batched call instead of one model invocation per row.
        _job_embeddings = bert_model.encode(descriptions)
    return _job_embeddings


def recommend_jobs(skills):
    """Return the five job postings most similar to the given skills.

    The skills are joined into one sentence, embedded with the sentence
    model, and compared by cosine similarity against the embedding of every
    job description.

    Args:
        skills: Skill names describing the user.

    Returns:
        A list of at most five records with the keys ``job_title``,
        ``company``, ``location`` and ``BERT_Similarity``, best match first.
        Empty when ``skills`` is empty or there are no job postings.
    """
    if not skills:
        return []
    if job_df.empty:
        return []

    # The job descriptions never change after start-up, so their embeddings
    # are computed once rather than on every request.
    job_embeddings = _get_job_embeddings()
    user_embedding = bert_model.encode(" ".join(skills))
    similarities = cosine_similarity(job_embeddings, [user_embedding])[:, 0]

    # Rank on a per-call copy: writing the scores into the shared job_df
    # would let concurrent requests overwrite each other's results.
    ranked = job_df.assign(BERT_Similarity=similarities)
    top_jobs = ranked.sort_values(by="BERT_Similarity", ascending=False).head(5)
    return top_jobs[["job_title", "company", "location", "BERT_Similarity"]].to_dict(orient="records")

@app.get("/")
def read_root():
    """Return a static banner identifying the service."""
    banner = {"message": "Skill Assessment API"}
    return banner

# POST endpoint for fetching challenges
@app.post("/challenges")
def get_user_challenges(request: ChallengeRequest):
    """Return random questions for each requested skill, without solutions.

    Responds with HTTP 400 when the skills list is empty. The response maps
    every requested skill to a list of ``{"question", "difficulty"}`` records
    under the top-level key ``"challenges"``.
    """
    if not request.skills:
        raise HTTPException(status_code=400, detail="Skills list cannot be empty")

    full_challenges = get_coding_challenges(request.skills, difficulty=request.difficulty)

    # Copy only the fields the client may see; the reference solutions stay
    # on the server.
    public_view = {}
    for category, challenge_list in full_challenges.items():
        visible = []
        for challenge in challenge_list:
            visible.append({"question": challenge["question"], "difficulty": challenge["difficulty"]})
        public_view[category] = visible

    return {"challenges": public_view}

# POST endpoint for assessing answers
# Number of questions /challenges hands out per skill; mirrors the default
# `num_questions` of get_coding_challenges.
_QUESTIONS_PER_SKILL = 5


@app.post("/assess")
def assess_skills(user_input: AssessmentRequest):
    """Grade submitted answers and recommend courses and jobs.

    For each requested skill:
      * if the question dataset has no question for it, the score is 0;
      * if the request carries no answers for it, the score defaults to 50;
      * otherwise every submitted answer whose question text exists in the
        dataset for that skill is graded against its reference solution and
        the results are averaged. Questions that were issued but left
        unanswered count as 0.

    Skills rated "Beginner" or "Intermediate" drive the course
    recommendations; job recommendations use all requested skills.

    Responds with HTTP 400 when the skills list is empty.
    """
    user_name = user_input.name
    user_skills = user_input.skills

    if not user_skills:
        raise HTTPException(status_code=400, detail="Skills list cannot be empty")

    # Fetch every question of each skill rather than a fresh random sample.
    # The questions the user answered were drawn at random by /challenges, so
    # a second random draw would only overlap with them by chance and grading
    # would be arbitrary for any skill with more questions than one draw.
    question_pool = get_coding_challenges(user_skills, num_questions=len(coding_df))

    user_scores = {}
    for skill, pool in question_pool.items():
        if not pool:
            user_scores[skill] = 0
            continue

        submitted = user_input.answers.get(skill) if user_input.answers else None
        if submitted is None:
            # Default score for a skill with no submitted answers.
            user_scores[skill] = 50.0
            continue

        solutions = {challenge["question"]: challenge["solutions"] for challenge in pool}
        # The start time is simulated as 10 s ago, so the time penalty in
        # evaluate_coding_with_time never applies here.
        scores = [
            evaluate_coding_with_time(user_code, solutions[question], time.time() - 10)
            for question, user_code in submitted.items()
            if question in solutions
        ]
        # Average over the number of questions issued so skipped questions
        # count as zero; if more answers than that were submitted, average
        # over all of them so the score cannot exceed 100.
        issued = min(len(pool), _QUESTIONS_PER_SKILL)
        user_scores[skill] = round(sum(scores) / max(issued, len(scores)), 2)

    proficiency_levels = {skill: get_proficiency_level(score) for skill, score in user_scores.items()}
    weak_skills = [skill for skill, level in proficiency_levels.items() if level in ["Beginner", "Intermediate"]]

    courses = recommend_courses(weak_skills)
    jobs = recommend_jobs(user_skills)

    return {
        "name": user_name,
        "skills": user_skills,
        "scores": user_scores,
        "proficiency_levels": proficiency_levels,
        "recommended_courses": courses,
        "recommended_jobs": jobs
    }

# When the file is executed directly, serve the app with uvicorn on all
# network interfaces, port 7860. uvicorn is imported here so that merely
# importing this module does not require it.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)