File size: 6,541 Bytes
95e3862
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
import mlflow
import mlflow.sklearn
import joblib

# Load the salary dataset from the CSV file in the working directory
df = pd.read_csv('ds_salaries.csv')

# EDA: preview of the first rows, column/dtype summary, descriptive statistics
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())

# Visualizations: seaborn pair plot of the dataset, shown in a blocking window
sns.pairplot(df)
plt.show()

# Impute missing values column by column:
# numeric columns get their mean, text (object) columns get their most
# frequent value.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())
for col in categorical_cols:
    # mode() can return several values; the first one is used
    df[col] = df[col].fillna(df[col].mode()[0])

# salary_currency is not used as a predictor, so it is removed up front
df = df.drop('salary_currency', axis=1)

# One-hot encode the categorical predictors; drop_first=True omits the first
# level of each column so it acts as the baseline category
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Target is the USD salary; both salary columns are excluded from the features
y = df['salary_in_usd']
X = df.drop(columns=['salary', 'salary_in_usd'])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model training and experiment tracking with MLflow.
# Each candidate regressor is fitted on the training split and evaluated on
# the held-out test split inside its own MLflow run.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

mlflow.set_experiment('Data Science Salaries Prediction')

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log model
        mlflow.sklearn.log_model(model, model_name)

        # Log metrics.
        # RMSE is taken as the square root of the MSE instead of calling
        # mean_squared_error(..., squared=False): that keyword was deprecated
        # in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mlflow.log_metric('RMSE', rmse)
        mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
        mlflow.log_metric('R2', r2_score(y_test, y_pred))

# Hyperparameter tuning for the best model (e.g., Random Forest)
from sklearn.model_selection import GridSearchCV

# Search space: number of trees and maximum tree depth
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by R2
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

# Record the winning estimator, its hyperparameters and its mean CV score
with mlflow.start_run(run_name='Optimized Random Forest'):
    tuned_forest = grid_search.best_estimator_
    mlflow.sklearn.log_model(tuned_forest, 'Random Forest')
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric('Best R2', grid_search.best_score_)














# Model training and experiment tracking with MLflow.
# NOTE(review): this section repeats the training loop that appears earlier
# in the file, so every model is fitted and logged a second time - confirm
# whether both copies are intended.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

mlflow.set_experiment('Data Science Salaries Prediction')

for model_name, model in models.items():
    # One MLflow run per candidate model, named after the model
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log model
        mlflow.sklearn.log_model(model, model_name)

        # Log metrics.
        # RMSE is taken as the square root of the MSE instead of calling
        # mean_squared_error(..., squared=False): that keyword was deprecated
        # in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mlflow.log_metric('RMSE', rmse)
        mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
        mlflow.log_metric('R2', r2_score(y_test, y_pred))

# Hyperparameter tuning for the best model (e.g., Random Forest).
# NOTE(review): an identical grid search appears earlier in the file - confirm
# whether running it twice is intended.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by R2
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

# Record the winning estimator, its hyperparameters and its mean CV score
with mlflow.start_run(run_name='Optimized Random Forest'):
    tuned_forest = grid_search.best_estimator_
    mlflow.sklearn.log_model(tuned_forest, 'Random Forest')
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric('Best R2', grid_search.best_score_)

# Persist the tuned estimator to disk so it can be loaded for prediction later
joblib.dump(grid_search.best_estimator_, 'best_model.pkl')


















import streamlit as st
import pandas as pd
import joblib

# Load the original dataset to get unique values for dropdowns
df_original = pd.read_csv('ds_salaries.csv')

# Load the best model written by the training step.
# NOTE(review): joblib files are pickle-based and run code when loaded; only
# load 'best_model.pkl' from a trusted location.
best_model = joblib.load('best_model.pkl')

# Streamlit app
st.title('Data Science Salary Predictor')

# Input features: one dropdown per predictor, populated from the dataset
experience_level = st.selectbox('Experience Level', df_original['experience_level'].unique())
employment_type = st.selectbox('Employment Type', df_original['employment_type'].unique())
job_title = st.selectbox('Job Title', df_original['job_title'].unique())
employee_residence = st.selectbox('Employee Residence', df_original['employee_residence'].unique())
remote_ratio = st.selectbox('Remote Ratio', df_original['remote_ratio'].unique())
company_location = st.selectbox('Company Location', df_original['company_location'].unique())
company_size = st.selectbox('Company Size', df_original['company_size'].unique())

# Build a single-row frame from the selections (work_year is fixed to 2023)
input_data = pd.DataFrame({
    'work_year': [2023],
    'experience_level': [experience_level],
    'employment_type': [employment_type],
    'job_title': [job_title],
    'employee_residence': [employee_residence],
    'remote_ratio': [remote_ratio],
    'company_location': [company_location],
    'company_size': [company_size]
})

# Encode categorical variables.
# drop_first must NOT be used here: the frame has a single row, so every
# categorical column has exactly one level, and drop_first=True would drop
# that level. No dummy column would survive and the prediction would ignore
# every categorical selection. Baseline levels that were dropped at training
# time are removed by the reindex below instead.
categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
input_data = pd.get_dummies(input_data, columns=categorical_columns)

# Align input data with the columns the model was trained on.
# Prefer the feature names stored on the fitted estimator so the app does not
# depend on the training variables; fall back to X_train for estimators that
# do not expose feature_names_in_.
feature_columns = getattr(best_model, 'feature_names_in_', None)
if feature_columns is None:
    feature_columns = X_train.columns
# Dummy columns absent from this row are filled with 0; columns unknown to the
# model (e.g. baseline levels) are discarded.
input_data = input_data.reindex(columns=feature_columns, fill_value=0)

# Predict the salary
salary_prediction = best_model.predict(input_data)[0]
st.write(f'Predicted Salary: ${salary_prediction:.2f}')