Upload 6 files
Browse files- app.py +137 -0
- datasets/cleaned_survey_results_public.csv +0 -0
- models/CatBoostRegressor.joblib +3 -0
- models/LGBMRegressor.joblib +3 -0
- models/XGBoostRegressor.joblib +3 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import joblib
|
3 |
+
import pandas as pd
|
4 |
+
import streamlit as st
|
5 |
+
from sklearn.model_selection import train_test_split
|
6 |
+
from sklearn.metrics import r2_score
|
7 |
+
from typing import List, Dict, Any
|
8 |
+
|
9 |
+
# Constants for directories and file names.
# Paths are resolved relative to this script so the app runs wherever the
# repository is checked out, instead of depending on one machine's absolute
# Windows path. DIR keeps a trailing separator so that any code building paths
# with ``DIR + 'name'`` keeps working.
DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '')
MODEL_DIR = os.path.join(DIR, 'models')
DATA_DIR = os.path.join(DIR, 'datasets')
# File name as shipped in the ``datasets`` directory next to this script.
DATA_FILE = 'cleaned_survey_results_public.csv'
# Display names of the models to load; the file name of each model is its
# display name with the spaces removed plus ``.joblib``.
MODEL_NAMES = [
    # 'CatBoost Regressor',  # NOTE(review): disabled in the original; reason not recorded.
    'XGBoost Regressor',
    'LGBM Regressor',
]
|
20 |
+
|
21 |
+
def load_models(model_names: List[str]) -> Dict[str, Any]:
    """Deserialize the named models from ``MODEL_DIR``.

    Each model is read from ``<display name without spaces>.joblib``. A model
    that cannot be loaded is reported through ``st.error`` and omitted from
    the returned mapping, so the result may hold fewer entries than
    ``model_names``.

    Args:
        model_names: Display names of the models to load.

    Returns:
        Mapping of display name to the loaded model object.
    """
    loaded: Dict[str, Any] = {}
    for model_name in model_names:
        file_name = model_name.replace(' ', '') + '.joblib'
        model_path = os.path.join(MODEL_DIR, file_name)
        try:
            loaded[model_name] = joblib.load(model_path)
        except Exception as exc:
            # Surface the failure in the UI and keep loading the other models.
            st.error(f"Error loading model {model_name}: {str(exc)}")
    return loaded
|
31 |
+
|
32 |
+
# Load the persisted regressors named in MODEL_NAMES.
models = load_models(MODEL_NAMES)

# Read the survey dataset from disk.
data_path = os.path.join(DATA_DIR, DATA_FILE)
df = pd.read_csv(data_path)

# Separate the target column from the features and hold out 10% of the rows.
y = df['Salary']
X = df.drop(columns=['Salary'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=123,
)

# Options offered by the categorical select boxes: the distinct values of each
# column, in order of first appearance in the dataset.
input_choices = {
    column: df[column].unique().tolist()
    for column in ('MainBranch', 'Country', 'EducationLevel', 'RemoteWork')
}

# Defaults and upper bounds for the numeric inputs. The upper bounds are set to
# 1.5 times the largest value found in the dataset.
default_comp = float(df['CompTotal'].mean())
max_comp = float(df['CompTotal'].max() * 1.5)
default_years = 3.0
max_years = float(df['YearsOfExperience'].max() * 1.5)

# Training-set predictions per model, computed once per script run so that the
# R2 score does not require a new ``predict`` call for every request.
y_train_predictions = {
    model_name: fitted.predict(X_train)
    for model_name, fitted in models.items()
}
|
60 |
+
|
61 |
+
def load_and_predict(sample: pd.DataFrame) -> pd.DataFrame:
    """Predict the salary for ``sample`` with every loaded model.

    Args:
        sample: Frame of input features. Only the first prediction returned by
            each model is used.

    Returns:
        One row per model that predicted successfully, with the columns
        ``Model``, ``Predicted Salary`` and ``R2 Score (%)``, sorted by
        ``R2 Score (%)`` in descending order. The R2 score is computed on the
        training split from the module-level ``y_train`` and
        ``y_train_predictions``. When no model produced a prediction the
        frame is empty but still carries the three columns.
    """
    columns = ['Model', 'Predicted Salary', 'R2 Score (%)']
    results = []

    for name, model in models.items():
        try:
            salary_pred = model.predict(sample)[0]
            results.append({
                'Model': name,
                'Predicted Salary': salary_pred,
                'R2 Score (%)': r2_score(y_train, y_train_predictions[name]) * 100,
            })
        except Exception as e:
            # Report in the UI and continue, so one failing model does not hide
            # the predictions of the others.
            st.error(f"Error during prediction with model {name}: {str(e)}")

    # Passing ``columns`` keeps the sort key present when ``results`` is empty.
    # Without it ``sort_values`` raises KeyError, so callers never reach their
    # ``.empty`` check.
    frame = pd.DataFrame(results, columns=columns)
    return frame.sort_values(by='R2 Score (%)', ascending=False).reset_index(drop=True)
|
77 |
+
|
78 |
+
# Page configuration and title.
st.set_page_config(
    page_title="Developer Salary Prediction App",
    page_icon="🤑",
    layout="wide",
)
st.title("🤑 **Developer Salary Prediction**")

# Sidebar widgets: four categorical select boxes followed by two numeric inputs.
st.sidebar.header("Input Information")
mainbranch = st.sidebar.selectbox(
    "**MainBranch**",
    options=input_choices['MainBranch'],
)
country = st.sidebar.selectbox(
    "**Country**",
    options=input_choices['Country'],
)
educationlevel = st.sidebar.selectbox(
    "**Education Level**",
    options=input_choices['EducationLevel'],
)
remotework = st.sidebar.selectbox(
    "**Remote Work**",
    options=input_choices['RemoteWork'],
)
comptotal = st.sidebar.number_input(
    "**CompTotal**",
    min_value=0.0,
    max_value=max_comp,
    value=default_comp,
)
yearsofexperience = st.sidebar.number_input(
    "**Years of Experience**",
    min_value=0.0,
    max_value=max_years,
    value=default_years,
)

# When the button is pressed, assemble the inputs into a one-row frame and show
# the prediction of every model.
if st.sidebar.button(label=':rainbow[Predict Salary]'):
    input_data = pd.DataFrame(
        [[
            mainbranch,
            country,
            educationlevel,
            remotework,
            comptotal,
            yearsofexperience,
        ]],
        columns=[
            'MainBranch',
            'Country',
            'EducationLevel',
            'RemoteWork',
            'CompTotal',
            'YearsOfExperience',
        ],
    )

    results_df = load_and_predict(input_data)

    # Show the table only when at least one model returned a prediction.
    if not results_df.empty:
        st.write("### Prediction Results:")
        st.dataframe(results_df)
|
102 |
+
|
103 |
+
# Disclaimer section: a horizontal rule followed by a fixed description of the
# app, rendered as preformatted text.
# The list of models in the first paragraph names only the regressors this app
# loads; it previously mentioned Random Forest, which is not among them.
# NOTE(review): any leading whitespace inside the literal could not be
# recovered from the reviewed copy — confirm against the original layout.
st.markdown("---")
st.text('''
>> Developer Salary Prediction App <<
This Streamlit application predicts developer salary using multiple machine learning models including LGBM and XGBoost regressors.
Users can input developer information through a user-friendly interface, which includes fields such as country, education level, and years of experience.

> Features:
**Input Components**:
- **MainBranch**: Select your main area of expertise in development, such as software engineering, data science, or web development. This selection may influence salary expectations based on the branch's demand and trends.

- **Country**: Choose your country from the dropdown list. Regions often exhibit varying salary scales due to economic factors, the cost of living, and market demand for tech workers.

- **Education Level**: Indicate the highest level of education you have completed. Higher educational qualifications often correlate with higher earning potential in the tech industry.

- **Remote Work**: Specify whether you primarily work remotely, on-site, or in a hybrid setup. Remote work setups can affect salary offers, especially if hiring companies are based in different geographic areas.

- **CompTotal**: Enter your expected total compensation, which includes salary, bonuses, and other benefits. This field is crucial for setting a base for salary predictions and facilitates comparisons.

- **Years of Experience**: Provide the number of years you've been in a coding-related job. Generally, more years of experience are associated with higher salaries due to skill accumulation and professional development.

**Data Processing**:
- The app employs a pre-processed dataset, cleaned and prepared for model training.
- It utilizes features including country, education level, and years of experience for predictions.
- Models are loaded from disk, obtaining predictions based on user-provided input.

**Prediction**: The app performs predictions with loaded models and calculates performance metrics like R2 score.
**Results Display**: The predicted salary and model performance metrics are presented in a user-friendly format.

> Usage:
Fill out the developer information and click "Predict Salary" to derive insights on anticipated salary and model performance.

> Disclaimer:
This application serves educational purposes. Predictions are not guaranteed to be accurate.
''')
|
datasets/cleaned_survey_results_public.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/CatBoostRegressor.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:807c04de1b414d8c59dac807b89a30fa85f961c0ff3185dbd0f19e065bcadf31
|
3 |
+
size 347038
|
models/LGBMRegressor.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4b589f3a59666c3f2bcc1f6642c0e29e350360c27ba5d6a447d304aea65e45cb
|
3 |
+
size 871178
|
models/XGBoostRegressor.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4d4b8db58c45e903904275bb98611d5a4e08c9fc9cddbfb25c42d16dfc2b5657
|
3 |
+
size 2205896
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
numpy
|
3 |
+
joblib
|
4 |
+
scikit-learn
|
5 |
+
lightgbm
|
6 |
+
xgboost
|
7 |
+
catboost
|