|
|
|
import pandas as pd |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
from sklearn.model_selection import train_test_split |
|
import mlflow |
|
import mlflow.sklearn |
|
from sklearn.linear_model import LinearRegression |
|
from sklearn.tree import DecisionTreeRegressor |
|
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor |
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score |
|
|
|
|
|
from sklearn.linear_model import LinearRegression |
|
from sklearn.tree import DecisionTreeRegressor |
|
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor |
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score |
|
from sklearn.model_selection import GridSearchCV |
|
import mlflow |
|
import mlflow.sklearn |
|
import joblib |
|
|
|
|
|
# Load the salaries dataset from the current working directory.
df = pd.read_csv('ds_salaries.csv')

# Quick inspection: first rows, schema / non-null counts, summary statistics.
print(df.head())
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called bare; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Pairwise plots of the dataset's columns for a visual overview.
sns.pairplot(df)
plt.show()
|
|
|
|
|
|
|
# Partition the columns by dtype so each group gets its own imputation rule.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Missing numeric values are replaced by the mean of their column.
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# Missing text values are replaced by the first (most frequent) mode of
# their column.
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# The currency code is not used as a feature.
# NOTE(review): presumably redundant because the target is already in USD
# -- confirm.
df = df.drop('salary_currency', axis=1)

# One-hot encode the categorical features; drop_first removes one level per
# feature so that level acts as the implicit baseline.
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Target is the USD salary; both salary columns are excluded from the
# feature matrix.
y = df['salary_in_usd']
X = df.drop(columns=['salary', 'salary_in_usd'])

# Hold out 20% of the rows for evaluation, with a fixed seed for a
# repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
|
|
|
|
|
# Candidate regressors (all with library-default hyperparameters), keyed by
# the display name that is reused as the MLflow run name and artifact path.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
}

mlflow.set_experiment('Data Science Salaries Prediction')

# Fit every candidate inside its own MLflow run and record the fitted model
# together with its hold-out metrics.
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mlflow.sklearn.log_model(model, model_name)

        # RMSE is computed as an explicit square root of the MSE. The
        # `squared=False` keyword of mean_squared_error was deprecated in
        # scikit-learn 1.4 and removed in 1.6, where it raises TypeError;
        # this form works on every version.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mlflow.log_metric('RMSE', rmse)
        mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
        mlflow.log_metric('R2', r2_score(y_test, y_pred))
|
|
|
|
|
from sklearn.model_selection import GridSearchCV |
|
|
|
# Hyperparameter candidates for the random forest: 3 x 4 = 12 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
)

# Exhaustive search over the grid, scored by R2 with 3-fold cross-validation
# on the training split.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

# Record the winning estimator, its parameters and its cross-validated score
# in a dedicated MLflow run.
with mlflow.start_run(run_name='Optimized Random Forest'):
    mlflow.sklearn.log_model(grid_search.best_estimator_, 'Random Forest')
    mlflow.log_params(grid_search.best_params_)
    # best_score_ is the mean cross-validation R2, not a test-set score.
    mlflow.log_metric('Best R2', grid_search.best_score_)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): this section repeats the model dictionary and training loop
# that already appear earlier in the file; running the script logs each
# baseline model twice. Consider removing one copy.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
}

mlflow.set_experiment('Data Science Salaries Prediction')

# One MLflow run per candidate: fit on the training split, predict on the
# hold-out split, then log the model and its error metrics.
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mlflow.sklearn.log_model(model, model_name)

        # Take the square root of the MSE explicitly instead of passing
        # `squared=False`: that keyword was deprecated in scikit-learn 1.4
        # and removed in 1.6, so the old call fails on current releases.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mlflow.log_metric('RMSE', rmse)
        mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
        mlflow.log_metric('R2', r2_score(y_test, y_pred))
|
|
|
|
|
# Search space for tuning the random forest.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
)

# 3-fold cross-validated grid search that selects the combination with the
# highest R2 on the training split.
grid_search = GridSearchCV(
    RandomForestRegressor(),
    param_grid,
    scoring='r2',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Log the tuned forest, the chosen hyperparameters and the cross-validated
# score under a separate run name.
with mlflow.start_run(run_name='Optimized Random Forest'):
    mlflow.sklearn.log_model(grid_search.best_estimator_, 'Random Forest')
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric('Best R2', grid_search.best_score_)

# Persist the tuned estimator to disk; the prediction UI further down loads
# this same file.
joblib.dump(grid_search.best_estimator_, 'best_model.pkl')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import joblib |
|
|
|
|
|
# Raw (un-encoded) dataset; used here only to populate the widget choices.
df_original = pd.read_csv('ds_salaries.csv')

# Tuned model written by the training section of this file.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load model
# files produced by this project.
best_model = joblib.load('best_model.pkl')

st.title('Data Science Salary Predictor')

# One select box per input feature, offering the values seen in the dataset.
experience_level = st.selectbox('Experience Level', df_original['experience_level'].unique())
employment_type = st.selectbox('Employment Type', df_original['employment_type'].unique())
job_title = st.selectbox('Job Title', df_original['job_title'].unique())
employee_residence = st.selectbox('Employee Residence', df_original['employee_residence'].unique())
remote_ratio = st.selectbox('Remote Ratio', df_original['remote_ratio'].unique())
company_location = st.selectbox('Company Location', df_original['company_location'].unique())
company_size = st.selectbox('Company Size', df_original['company_size'].unique())

# Single-row frame describing the requested profile; the work year is fixed
# at 2023.
input_data = pd.DataFrame({
    'work_year': [2023],
    'experience_level': [experience_level],
    'employment_type': [employment_type],
    'job_title': [job_title],
    'employee_residence': [employee_residence],
    'remote_ratio': [remote_ratio],
    'company_location': [company_location],
    'company_size': [company_size],
})

categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]
# Encode WITHOUT drop_first. Each column of this one-row frame has exactly
# one level, so drop_first=True would remove every indicator column and the
# model would receive zeros for all categorical features no matter what the
# user picked. Keeping the level yields e.g. `company_size_M = 1`.
input_data = pd.get_dummies(input_data, columns=categorical_columns, drop_first=False)

# Align with the training feature matrix: indicator columns missing from this
# row become 0, and indicators that training did not keep (the dropped
# baseline level) are discarded, which reproduces the training-time encoding.
# NOTE(review): this relies on X_train from the training section being in
# scope when the app code runs.
input_data = input_data.reindex(columns=X_train.columns, fill_value=0)

salary_prediction = best_model.predict(input_data)[0]
st.write(f'Predicted Salary: ${salary_prediction:.2f}')
|
|
|
|
|
|
|
|
|
|