import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# import joblib

# Load the dataset; keep an untouched copy so the dropdowns can show the original values
df = pd.read_csv('ds_salaries.csv')
df_original = df.copy()

# Load the best model
# best_model = joblib.load('best_model.pkl')

numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Fill missing values: numeric columns with the column mean, categorical columns with the mode
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.mean()))
df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))

# Drop the salary_currency column as it's not needed for prediction
df = df.drop(columns=['salary_currency'])

# Encode categorical variables
categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Define features and target variable
X = df.drop(['salary', 'salary_in_usd'], axis=1)
y = df['salary_in_usd']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regression models (only the Random Forest is tuned via grid search below)
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}
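
# Note: the dictionary above is not used by the rest of the script. A hedged sketch of
# how the candidates could be compared on the held-out split (illustrative only, not
# part of the original app, and slow if run on every Streamlit rerun):
#
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     print(f'{name}: test R^2 = {model.score(X_test, y_test):.3f}')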


# Hyper-parameter grid for tuning the Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)
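
# In Streamlit, the grid search above re-runs on every widget interaction. A hedged
# sketch of caching the fitted model with st.cache_resource (an assumption: requires
# a recent Streamlit release; not part of the original script):
#
# @st.cache_resource
# def train_best_model(X_train, y_train):
#     gs = GridSearchCV(RandomForestRegressor(), param_grid, cv=3, scoring='r2')
#     gs.fit(X_train, y_train)
#     return gs.best_estimator_
#
# best_model = train_best_model(X_train, y_train)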


# Streamlit app
st.title('Data Science Salary Predictor')

# Input features
experience_level = st.selectbox('Experience Level', df_original['experience_level'].unique())
employment_type = st.selectbox('Employment Type', df_original['employment_type'].unique())
job_title = st.selectbox('Job Title', df_original['job_title'].unique())
employee_residence = st.selectbox('Employee Residence', df_original['employee_residence'].unique())
remote_ratio = st.selectbox('Remote Ratio', df_original['remote_ratio'].unique())
company_location = st.selectbox('Company Location', df_original['company_location'].unique())
company_size = st.selectbox('Company Size', df_original['company_size'].unique())

# Predict salary
input_data = pd.DataFrame({
    'work_year': [2023],
    'experience_level': [experience_level],
    'employment_type': [employment_type],
    'job_title': [job_title],
    'employee_residence': [employee_residence],
    'remote_ratio': [remote_ratio],
    'company_location': [company_location],
    'company_size': [company_size]
})

# One-hot encode the single input row. drop_first must not be used here: with only
# one row, every column has a single category, so drop_first=True would drop every
# dummy column and the prediction would ignore the user's selections.
categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
input_data = pd.get_dummies(input_data, columns=categorical_columns)

# Align input data with training data columns
input_data = input_data.reindex(columns=X_train.columns, fill_value=0)

# joblib.dump(grid_search.best_estimator_, 'best_model.pkl')
# Predict the salary
salary_prediction = grid_search.best_estimator_.predict(input_data)[0]
st.write(f'Predicted Salary: ${salary_prediction:,.2f}')
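
# Optional addition (not in the original script): report the tuned model's R^2 on the
# held-out test split so users can judge how much to trust the prediction.
st.write(f'Model R² on held-out test data: {grid_search.best_estimator_.score(X_test, y_test):.3f}')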