Amitpwa committed on
Commit
95e3862
1 Parent(s): f33c946

uploaded dataset and app.py file

Browse files

Uploaded the dataset and the app.py file here because the upload through the terminal was not being accepted.

Files changed (2) hide show
  1. app.py +215 -0
  2. ds_salaries.csv +0 -0
app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Train a data-science salary regressor and serve it through a Streamlit form.

The script has two phases:

1. Training (only when ``best_model.pkl`` is missing): load ``ds_salaries.csv``,
   print a quick EDA, compare four regressors with MLflow tracking, grid-search
   a random forest, and persist the best estimator with joblib.
2. Serving: build a Streamlit form from the categorical values found in the
   dataset, encode the selection to match the model's training columns, and
   display the predicted salary.
"""

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor

DATA_PATH = 'ds_salaries.csv'
MODEL_PATH = 'best_model.pkl'
EXPERIMENT_NAME = 'Data Science Salaries Prediction'
CATEGORICAL_COLUMNS = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]


def explore(df):
    """Print summary tables for *df* and show a seaborn pair plot."""
    print(df.head())
    # DataFrame.info() prints by itself and returns None, so it is not wrapped
    # in print().
    df.info()
    print(df.describe())

    sns.pairplot(df)
    plt.show()


def build_features(df):
    """Return ``(X, y)`` for training from the raw salaries frame.

    Numeric gaps are filled with the column mean and categorical gaps with the
    column mode. ``salary_currency`` is dropped, the categorical columns are
    one-hot encoded with the first level of each dropped, and ``salary`` /
    ``salary_in_usd`` are removed from the features. The target is
    ``salary_in_usd``.
    """
    df = df.copy()

    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    df[categorical_cols] = df[categorical_cols].apply(
        lambda col: col.fillna(col.mode()[0])
    )

    # Drop the salary_currency column as it's not needed for prediction
    df = df.drop(columns=['salary_currency'])

    df = pd.get_dummies(df, columns=CATEGORICAL_COLUMNS, drop_first=True)

    X = df.drop(['salary', 'salary_in_usd'], axis=1)
    y = df['salary_in_usd']
    return X, y


def train_and_save_model():
    """Run the full training pipeline and write the tuned model to MODEL_PATH.

    Each baseline model is fitted and logged once in its own MLflow run with
    RMSE, MAE and R2 on the held-out split. A random forest is then tuned with
    a 3-fold grid search, logged, and dumped with joblib.

    Returns:
        The best estimator found by the grid search.
    """
    df = pd.read_csv(DATA_PATH)
    explore(df)
    X, y = build_features(df)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
    }

    mlflow.set_experiment(EXPERIMENT_NAME)

    for model_name, model in models.items():
        with mlflow.start_run(run_name=model_name):
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            mlflow.sklearn.log_model(model, model_name)

            # Take the square root explicitly instead of passing
            # squared=False, which newer scikit-learn releases no longer accept.
            rmse = mean_squared_error(y_test, y_pred) ** 0.5
            mlflow.log_metric('RMSE', rmse)
            mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
            mlflow.log_metric('R2', r2_score(y_test, y_pred))

    # Hyperparameter tuning for the best model (e.g., Random Forest)
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
    }
    grid_search = GridSearchCV(
        RandomForestRegressor(), param_grid, cv=3, scoring='r2'
    )
    grid_search.fit(X_train, y_train)

    with mlflow.start_run(run_name='Optimized Random Forest'):
        mlflow.sklearn.log_model(grid_search.best_estimator_, 'Random Forest')
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric('Best R2', grid_search.best_score_)

    joblib.dump(grid_search.best_estimator_, MODEL_PATH)
    return grid_search.best_estimator_


def encode_input(raw_input, feature_columns):
    """One-hot encode a raw input row and align it to *feature_columns*.

    ``drop_first`` is deliberately NOT used here: with a single row every
    categorical column has exactly one level, so dropping the first level
    would erase the user's selection entirely. Reindexing against the training
    columns discards the baseline levels that training dropped and zero-fills
    every level the user did not pick.
    """
    encoded = pd.get_dummies(raw_input, columns=CATEGORICAL_COLUMNS)
    return encoded.reindex(columns=feature_columns, fill_value=0)


# Train only when no saved model exists, so the expensive pipeline is not
# repeated on every execution of this script.
if not Path(MODEL_PATH).exists():
    train_and_save_model()

# Load the original dataset to get unique values for dropdowns
df_original = pd.read_csv(DATA_PATH)

# Load the best model.
# NOTE: joblib.load unpickles the file; only load a model file you created.
best_model = joblib.load(MODEL_PATH)

# Streamlit app
st.title('Data Science Salary Predictor')

# Input features
experience_level = st.selectbox('Experience Level', df_original['experience_level'].unique())
employment_type = st.selectbox('Employment Type', df_original['employment_type'].unique())
job_title = st.selectbox('Job Title', df_original['job_title'].unique())
employee_residence = st.selectbox('Employee Residence', df_original['employee_residence'].unique())
remote_ratio = st.selectbox('Remote Ratio', df_original['remote_ratio'].unique())
company_location = st.selectbox('Company Location', df_original['company_location'].unique())
company_size = st.selectbox('Company Size', df_original['company_size'].unique())

# Predict salary
input_data = pd.DataFrame({
    'work_year': [2023],
    'experience_level': [experience_level],
    'employment_type': [employment_type],
    'job_title': [job_title],
    'employee_residence': [employee_residence],
    'remote_ratio': [remote_ratio],
    'company_location': [company_location],
    'company_size': [company_size],
})

# Align input data with the columns the model was fitted on. The fitted
# estimator records them in feature_names_in_, so no training data is needed
# at prediction time.
input_data = encode_input(input_data, best_model.feature_names_in_)

# Predict the salary
salary_prediction = best_model.predict(input_data)[0]
st.write(f'Predicted Salary: ${salary_prediction:.2f}')
ds_salaries.csv ADDED
The diff for this file is too large to render. See raw diff