File size: 6,541 Bytes
95e3862 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
import mlflow
import mlflow.sklearn
import joblib
# Load dataset
df = pd.read_csv('ds_salaries.csv')

# EDA: preview the first rows, the column summary, and descriptive statistics
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Visualizations
sns.pairplot(df)
plt.show()

# Handle missing values
# Fill numeric columns with mean and categorical columns with mode
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.mean()))
df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))
# NOTE(review): the fill values above are computed on the full dataset, i.e.
# before the train/test split below, so test rows influence the imputation.

# Drop the salary_currency column as it's not needed for prediction
df = df.drop(columns=['salary_currency'])

# Encode categorical variables
# drop_first=True omits the first level of each column, which then acts as the
# implicit baseline (all dummies of that column equal to 0).
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Define features and target variable
# Both salary columns are removed from the features so the target
# ('salary_in_usd') cannot leak into X.
X = df.drop(['salary', 'salary_in_usd'], axis=1)
y = df['salary_in_usd']

# Split the data: 80% train / 20% test, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Model training and experiment tracking with MLflow
# Each candidate regressor is fitted once on the training split, evaluated on
# the held-out test split, and logged as its own MLflow run.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
}

mlflow.set_experiment('Data Science Salaries Prediction')

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log model
        mlflow.sklearn.log_model(model, model_name)

        # Log metrics
        # RMSE is derived as sqrt(MSE) instead of passing `squared=False`,
        # which newer scikit-learn releases deprecated and then removed.
        mlflow.log_metric('RMSE', mean_squared_error(y_test, y_pred) ** 0.5)
        mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
        mlflow.log_metric('R2', r2_score(y_test, y_pred))

# Hyperparameter tuning for the best model (e.g., Random Forest)
# 3 x 4 parameter combinations, each scored by 3-fold cross-validated R2 on
# the training split only.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)

# Log the best model and parameters
# 'Best R2' is the mean cross-validated score of the winning combination, not
# a test-set score.
with mlflow.start_run(run_name='Optimized Random Forest'):
    mlflow.sklearn.log_model(grid_search.best_estimator_, 'Random Forest')
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric('Best R2', grid_search.best_score_)

# Save the best model
joblib.dump(grid_search.best_estimator_, 'best_model.pkl')
import streamlit as st
import pandas as pd
import joblib
# Load the original dataset to get unique values for dropdowns
df_original = pd.read_csv('ds_salaries.csv')

# Load the best model
best_model = joblib.load('best_model.pkl')

# Streamlit app
st.title('Data Science Salary Predictor')

# Input features: one dropdown per feature, populated with the values that
# occur in the dataset
experience_level = st.selectbox('Experience Level', df_original['experience_level'].unique())
employment_type = st.selectbox('Employment Type', df_original['employment_type'].unique())
job_title = st.selectbox('Job Title', df_original['job_title'].unique())
employee_residence = st.selectbox('Employee Residence', df_original['employee_residence'].unique())
remote_ratio = st.selectbox('Remote Ratio', df_original['remote_ratio'].unique())
company_location = st.selectbox('Company Location', df_original['company_location'].unique())
company_size = st.selectbox('Company Size', df_original['company_size'].unique())

# Predict salary: assemble the selections into a single-row frame
input_data = pd.DataFrame({
    'work_year': [2023],
    'experience_level': [experience_level],
    'employment_type': [employment_type],
    'job_title': [job_title],
    'employee_residence': [employee_residence],
    'remote_ratio': [remote_ratio],
    'company_location': [company_location],
    'company_size': [company_size],
})

# Encode categorical variables
# drop_first must NOT be used here: every column of this one-row frame has a
# single category, so drop_first=True would discard all dummy columns and the
# model would see every categorical feature as 0 regardless of the selection.
# The reindex below drops the dummies of baseline levels instead.
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]
input_data = pd.get_dummies(input_data, columns=categorical_columns)

# Align input data with training data columns
# Prefer the column names stored on the fitted model; fall back to the
# in-memory training frame when the estimator does not expose them.
feature_columns = getattr(best_model, 'feature_names_in_', None)
if feature_columns is None:
    feature_columns = X_train.columns
input_data = input_data.reindex(columns=feature_columns, fill_value=0)

# Predict the salary
salary_prediction = best_model.predict(input_data)[0]
st.write(f'Predicted Salary: ${salary_prediction:.2f}')
|