Amitpwa committed on
Commit
6a41c90
·
verified ·
1 Parent(s): b046cf0

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -215
app.py DELETED
@@ -1,215 +0,0 @@
1
-
2
- import pandas as pd
3
- import seaborn as sns
4
- import matplotlib.pyplot as plt
5
- from sklearn.model_selection import train_test_split
6
- import mlflow
7
- import mlflow.sklearn
8
- from sklearn.linear_model import LinearRegression
9
- from sklearn.tree import DecisionTreeRegressor
10
- from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
11
- from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
12
-
13
-
14
- from sklearn.linear_model import LinearRegression
15
- from sklearn.tree import DecisionTreeRegressor
16
- from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
17
- from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
18
- from sklearn.model_selection import GridSearchCV
19
- import mlflow
20
- import mlflow.sklearn
21
- import joblib
22
-
23
# ---------------------------------------------------------------------------
# Data preparation: read the salaries CSV, print a quick overview, impute
# missing values, one-hot encode the categorical columns and create the
# train/test split used by the modelling sections further down.
# ---------------------------------------------------------------------------

# Read the raw dataset from the working directory.
df = pd.read_csv('ds_salaries.csv')

# Quick textual overview: first rows, column info, summary statistics.
print(df.head())
print(df.info())
print(df.describe())

# Pairwise scatter plots of the columns seaborn can plot.
sns.pairplot(df)
plt.show()

# Missing values: numeric columns receive their column mean, text columns
# receive their most frequent value.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

df[numeric_cols] = df[numeric_cols].apply(
    lambda column: column.fillna(column.mean())
)
df[categorical_cols] = df[categorical_cols].apply(
    lambda column: column.fillna(column.mode()[0])
)

# The currency code is not used as a predictor.
df = df.drop(columns=['salary_currency'])

# One-hot encode the categorical predictors; the first level of each column
# is dropped and acts as the implicit baseline.
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Target is the USD salary; both salary columns are removed from the features.
y = df['salary_in_usd']
X = df.drop(['salary', 'salary_in_usd'], axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
56
-
57
# Model training and experiment tracking with MLflow.
# Each candidate regressor is fitted on the training split inside its own
# MLflow run; the fitted model and its hold-out metrics are logged to that run.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
}

mlflow.set_experiment('Data Science Salaries Prediction')

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log model
        mlflow.sklearn.log_model(model, model_name)

        # Log metrics.
        # RMSE is taken as the square root of the MSE instead of passing
        # `squared=False`: that keyword was deprecated in scikit-learn 1.4
        # and removed in 1.6, where it raises a TypeError.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mlflow.log_metric('RMSE', rmse)
        mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
        mlflow.log_metric('R2', r2_score(y_test, y_pred))
79
-
80
# Hyperparameter tuning for the random forest: exhaustive grid search over
# the number of trees and the maximum depth, scored by R2 with 3-fold CV.
from sklearn.model_selection import GridSearchCV

param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
)

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

# Record the winning estimator, its parameters and its cross-validated score
# in a dedicated MLflow run.
with mlflow.start_run(run_name='Optimized Random Forest'):
    tuned_forest = grid_search.best_estimator_
    mlflow.sklearn.log_model(tuned_forest, 'Random Forest')
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric('Best R2', grid_search.best_score_)
97
-
98
-
99
-
100
-
101
-
102
-
103
-
104
-
105
-
106
-
107
-
108
-
109
-
110
# Model training and experiment tracking with MLflow.
# NOTE(review): this section repeats the identical training loop that appears
# earlier in this file, so every model is fitted and logged twice per run of
# the script. Confirm whether the repetition is intentional.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
}

mlflow.set_experiment('Data Science Salaries Prediction')

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log model
        mlflow.sklearn.log_model(model, model_name)

        # Log metrics.
        # RMSE is taken as the square root of the MSE instead of passing
        # `squared=False`: that keyword was deprecated in scikit-learn 1.4
        # and removed in 1.6, where it raises a TypeError.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        mlflow.log_metric('RMSE', rmse)
        mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
        mlflow.log_metric('R2', r2_score(y_test, y_pred))
132
-
133
# Hyperparameter tuning for the random forest: exhaustive grid search over
# the number of trees and the maximum depth, scored by R2 with 3-fold CV.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
)

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

# Record the winning estimator, its parameters and its cross-validated score
# in a dedicated MLflow run.
with mlflow.start_run(run_name='Optimized Random Forest'):
    mlflow.sklearn.log_model(grid_search.best_estimator_, 'Random Forest')
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric('Best R2', grid_search.best_score_)

# Persist the tuned estimator to disk; the Streamlit section below loads
# this file to serve predictions.
joblib.dump(grid_search.best_estimator_, 'best_model.pkl')
150
-
151
-
152
-
153
-
154
-
155
-
156
-
157
-
158
-
159
-
160
-
161
-
162
-
163
-
164
-
165
-
166
-
167
-
168
- import streamlit as st
169
- import pandas as pd
170
- import joblib
171
-
172
# ---------------------------------------------------------------------------
# Streamlit front end: collect the job attributes from dropdowns, encode them
# the same way as the training data, and display the predicted USD salary.
# ---------------------------------------------------------------------------

# Load the original dataset to get unique values for dropdowns
df_original = pd.read_csv('ds_salaries.csv')

# Load the best model
best_model = joblib.load('best_model.pkl')

# Streamlit app
st.title('Data Science Salary Predictor')

# Input features: one dropdown per predictor, populated with the values that
# actually occur in the dataset.
experience_level = st.selectbox('Experience Level', df_original['experience_level'].unique())
employment_type = st.selectbox('Employment Type', df_original['employment_type'].unique())
job_title = st.selectbox('Job Title', df_original['job_title'].unique())
employee_residence = st.selectbox('Employee Residence', df_original['employee_residence'].unique())
remote_ratio = st.selectbox('Remote Ratio', df_original['remote_ratio'].unique())
company_location = st.selectbox('Company Location', df_original['company_location'].unique())
company_size = st.selectbox('Company Size', df_original['company_size'].unique())

# Single-row frame holding the user's selection (work year is fixed to 2023).
input_data = pd.DataFrame({
    'work_year': [2023],
    'experience_level': [experience_level],
    'employment_type': [employment_type],
    'job_title': [job_title],
    'employee_residence': [employee_residence],
    'remote_ratio': [remote_ratio],
    'company_location': [company_location],
    'company_size': [company_size],
})

# Encode categorical variables.
# drop_first must NOT be used here: with a single row every categorical
# column has exactly one level, so drop_first=True would discard that level
# and leave no dummy columns at all, making the prediction ignore every
# categorical choice. The baseline level that training dropped is removed by
# the reindex below instead.
categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
input_data = pd.get_dummies(input_data, columns=categorical_columns)

# Align input data with training data columns: dummy columns unknown to the
# model (including the training baseline levels) are dropped, and columns the
# model expects but the selection did not produce are filled with 0.
input_data = input_data.reindex(columns=X_train.columns, fill_value=0)

# Predict the salary
salary_prediction = best_model.predict(input_data)[0]
st.write(f'Predicted Salary: ${salary_prediction:.2f}')
212
-
213
-
214
-
215
-