Spaces:

Amitpwa
/

mlOps

Sleeping

App Files Files Community

mlOps / app.py

Amitpwa

uploaded dataset and app.py file

95e3862 verified 12 months ago

raw

history blame

6.54 kB


	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	from sklearn.model_selection import train_test_split
	import mlflow
	import mlflow.sklearn
	from sklearn.linear_model import LinearRegression
	from sklearn.tree import DecisionTreeRegressor
	from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
	from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


	from sklearn.linear_model import LinearRegression
	from sklearn.tree import DecisionTreeRegressor
	from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
	from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
	from sklearn.model_selection import GridSearchCV
	import mlflow
	import mlflow.sklearn
	import joblib

	# Load dataset
	df = pd.read_csv('ds_salaries.csv')

	# EDA
	print(df.head())
	print(df.info())
	print(df.describe())

	# Visualizations
	sns.pairplot(df)
	plt.show()

	# Handle missing values
	# Fill numeric columns with mean and categorical columns with mode
	numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
	categorical_cols = df.select_dtypes(include=['object']).columns

	df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.mean()))
	df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))

	# Drop the salary_currency column as it's not needed for prediction
	df = df.drop(columns=['salary_currency'])

	# Encode categorical variables
	categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
	df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

	# Define features and target variable
	X = df.drop(['salary', 'salary_in_usd'], axis=1)
	y = df['salary_in_usd']

	# Split the data
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	# Model training and experiment tracking with MLflow
	models = {
	'Linear Regression': LinearRegression(),
	'Decision Tree': DecisionTreeRegressor(),
	'Random Forest': RandomForestRegressor(),
	'Gradient Boosting': GradientBoostingRegressor()
	}

	mlflow.set_experiment('Data Science Salaries Prediction')

	for model_name, model in models.items():
	with mlflow.start_run(run_name=model_name):
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)

	# Log model
	mlflow.sklearn.log_model(model, model_name)

	# Log metrics
	mlflow.log_metric('RMSE', mean_squared_error(y_test, y_pred, squared=False))
	mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
	mlflow.log_metric('R2', r2_score(y_test, y_pred))

	# Hyperparameter tuning for the best model (e.g., Random Forest)
	from sklearn.model_selection import GridSearchCV

	param_grid = {
	'n_estimators': [100, 200, 300],
	'max_depth': [None, 10, 20, 30]
	}

	grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=3, scoring='r2')
	grid_search.fit(X_train, y_train)

	# Log the best model and parameters
	with mlflow.start_run(run_name='Optimized Random Forest'):
	mlflow.sklearn.log_model(grid_search.best_estimator_, 'Random Forest')
	mlflow.log_params(grid_search.best_params_)
	mlflow.log_metric('Best R2', grid_search.best_score_)














	# Model training and experiment tracking with MLflow
	models = {
	'Linear Regression': LinearRegression(),
	'Decision Tree': DecisionTreeRegressor(),
	'Random Forest': RandomForestRegressor(),
	'Gradient Boosting': GradientBoostingRegressor()
	}

	mlflow.set_experiment('Data Science Salaries Prediction')

	for model_name, model in models.items():
	with mlflow.start_run(run_name=model_name):
	model.fit(X_train, y_train)
	y_pred = model.predict(X_test)

	# Log model
	mlflow.sklearn.log_model(model, model_name)

	# Log metrics
	mlflow.log_metric('RMSE', mean_squared_error(y_test, y_pred, squared=False))
	mlflow.log_metric('MAE', mean_absolute_error(y_test, y_pred))
	mlflow.log_metric('R2', r2_score(y_test, y_pred))

	# Hyperparameter tuning for the best model (e.g., Random Forest)
	param_grid = {
	'n_estimators': [100, 200, 300],
	'max_depth': [None, 10, 20, 30]
	}

	grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=3, scoring='r2')
	grid_search.fit(X_train, y_train)

	# Log the best model and parameters
	with mlflow.start_run(run_name='Optimized Random Forest'):
	mlflow.sklearn.log_model(grid_search.best_estimator_, 'Random Forest')
	mlflow.log_params(grid_search.best_params_)
	mlflow.log_metric('Best R2', grid_search.best_score_)

	# Save the best model
	joblib.dump(grid_search.best_estimator_, 'best_model.pkl')


















	import streamlit as st
	import pandas as pd
	import joblib

	# Load the original dataset to get unique values for dropdowns
	df_original = pd.read_csv('ds_salaries.csv')

	# Load the best model
	best_model = joblib.load('best_model.pkl')

	# Streamlit app
	st.title('Data Science Salary Predictor')

	# Input features
	experience_level = st.selectbox('Experience Level', df_original['experience_level'].unique())
	employment_type = st.selectbox('Employment Type', df_original['employment_type'].unique())
	job_title = st.selectbox('Job Title', df_original['job_title'].unique())
	employee_residence = st.selectbox('Employee Residence', df_original['employee_residence'].unique())
	remote_ratio = st.selectbox('Remote Ratio', df_original['remote_ratio'].unique())
	company_location = st.selectbox('Company Location', df_original['company_location'].unique())
	company_size = st.selectbox('Company Size', df_original['company_size'].unique())

	# Predict salary
	input_data = pd.DataFrame({
	'work_year': [2023],
	'experience_level': [experience_level],
	'employment_type': [employment_type],
	'job_title': [job_title],
	'employee_residence': [employee_residence],
	'remote_ratio': [remote_ratio],
	'company_location': [company_location],
	'company_size': [company_size]
	})

	# Encode categorical variables
	categorical_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
	input_data = pd.get_dummies(input_data, columns=categorical_columns, drop_first=True)

	# Align input data with training data columns
	input_data = input_data.reindex(columns=X_train.columns, fill_value=0)

	# Predict the salary
	salary_prediction = best_model.predict(input_data)[0]
	st.write(f'Predicted Salary: ${salary_prediction:.2f}')