Amitpwa committed on
Commit
dfa4582
1 Parent(s): 0294945

uploaded all files

Browse files

added requirements file

Files changed (3) hide show
  1. app.py +92 -0
  2. plaintext +8 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Streamlit app: predicts a data-science salary (USD) from job attributes,
# using a random forest tuned by grid search on ds_salaries.csv.
import pandas as pd
import streamlit as st
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

DATA_PATH = 'ds_salaries.csv'

# Columns that are one-hot encoded, for training data and user input alike.
CATEGORICAL_COLUMNS = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]

# Hyper-parameter grid searched for the random forest.
PARAM_GRID = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
}

# Candidate regressors. Only the random forest is actually tuned and used for
# prediction below; the others are kept for reference/experimentation.
# GradientBoostingRegressor is now imported, so building this dict no longer
# raises NameError.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
}


@st.cache_resource
def train_model():
    """Load the dataset, train the tuned random forest, and return the results.

    Streamlit reruns the whole script on every widget interaction; caching
    this function means the grid search runs once per server process rather
    than on every rerun.

    Returns:
        A tuple ``(df_original, feature_columns, best_model)`` where
        ``df_original`` is the raw dataset (used to populate the dropdowns),
        ``feature_columns`` is the ordered list of training feature names,
        and ``best_model`` is the refit best estimator from the grid search.
    """
    df_original = pd.read_csv(DATA_PATH)
    # Work on a copy so the raw values stay available for the dropdowns.
    df = df_original.copy()

    # Impute missing values: column mean for numeric, mode for categorical.
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.mean()))
    df[categorical_cols] = df[categorical_cols].apply(
        lambda x: x.fillna(x.mode()[0])
    )

    # Drop the salary_currency column as it's not needed for prediction.
    df = df.drop(columns=['salary_currency'])

    # Encode categorical variables; drop_first avoids a redundant baseline
    # column per category.
    df = pd.get_dummies(df, columns=CATEGORICAL_COLUMNS, drop_first=True)

    # Define features and target variable.
    X = df.drop(['salary', 'salary_in_usd'], axis=1)
    y = df['salary_in_usd']

    # Split the data (the held-out part is not used by the app itself).
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Tune the random forest with 3-fold cross-validation on R^2.
    grid_search = GridSearchCV(
        RandomForestRegressor(), PARAM_GRID, cv=3, scoring='r2'
    )
    grid_search.fit(X_train, y_train)

    return df_original, list(X_train.columns), grid_search.best_estimator_


df_original, feature_columns, best_model = train_model()

# Streamlit app
st.title('Data Science Salary Predictor')

# Input features
experience_level = st.selectbox('Experience Level', df_original['experience_level'].unique())
employment_type = st.selectbox('Employment Type', df_original['employment_type'].unique())
job_title = st.selectbox('Job Title', df_original['job_title'].unique())
employee_residence = st.selectbox('Employee Residence', df_original['employee_residence'].unique())
remote_ratio = st.selectbox('Remote Ratio', df_original['remote_ratio'].unique())
company_location = st.selectbox('Company Location', df_original['company_location'].unique())
company_size = st.selectbox('Company Size', df_original['company_size'].unique())

# Build a single-row frame from the user's selections.
input_data = pd.DataFrame({
    'work_year': [2023],
    'experience_level': [experience_level],
    'employment_type': [employment_type],
    'job_title': [job_title],
    'employee_residence': [employee_residence],
    'remote_ratio': [remote_ratio],
    'company_location': [company_location],
    'company_size': [company_size],
})

# Encode categorical variables WITHOUT drop_first: with a single row each
# column has exactly one level, and drop_first would discard it, zeroing out
# every categorical feature after the reindex below.
input_data = pd.get_dummies(input_data, columns=CATEGORICAL_COLUMNS)

# Align input data with training data columns. Dummies for the baseline
# categories dropped during training are discarded here, and unseen training
# columns are filled with 0.
input_data = input_data.reindex(columns=feature_columns, fill_value=0)

# Predict the salary
salary_prediction = best_model.predict(input_data)[0]
st.write(f'Predicted Salary: ${salary_prediction:.2f}')
plaintext ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ app.py
2
+ requirements.txt
3
+ joblib
4
+ pandas
5
+ seaborn
6
+ scikit-learn
7
+ matplotlib
8
+ mlflow
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ joblib
2
+ pandas
3
+ seaborn
4
+ scikit-learn
5
+ matplotlib
6
+ mlflow