Spaces:
Sleeping
Sleeping
File size: 10,970 Bytes
51f6345 9ff27a2 51f6345 9ff27a2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
import io
import os
import time
import zipfile

import altair as alt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# Page title
# The robot emoji (U+1F916) is written as an escape sequence so that it
# survives any re-encoding of this source file.
st.set_page_config(
    page_title='ML Model Building',
    page_icon='\U0001F916',
    layout='wide',  # Wide layout makes better use of the horizontal space
)
st.title('\U0001F916 ML Model Building')
# "About" panel: what the app does, how to drive it, and what it is built on.
# The two bullet lists are rendered as markdown code blocks, one "- item" per line.
_about_datasets = ('Drug solubility data set',)
_about_libraries = (
    'Pandas for data wrangling',
    'Scikit-learn for building a machine learning model',
    'Altair for chart creation',
    'Streamlit for user interface',
)

with st.expander('About this app'):
    # Purpose
    st.markdown('**What can this app do?**')
    st.info(
        'This app allow users to build a machine learning (ML) model in an end-to-end workflow. '
        'Particularly, this encompasses data upload, data pre-processing, ML model building '
        'and post-model analysis.'
    )

    # Usage
    st.markdown('**How to use the app?**')
    st.warning(
        'To engage with the app, go to the sidebar and 1. Select a data set and 2. Adjust the model '
        'parameters by adjusting the various slider widgets. As a result, this would initiate the ML '
        'model building process, display the model results as well as allowing users to download the '
        'generated models and accompanying data.'
    )

    # Implementation notes
    st.markdown('**Under the hood**')
    st.markdown('Data sets:')
    st.code(''.join(f'- {entry}\n' for entry in _about_datasets), language='markdown')

    st.markdown('Libraries used:')
    st.code(''.join(f'- {entry}\n' for entry in _about_libraries), language='markdown')
# Sidebar for accepting input parameters
EXAMPLE_DATA_URL = 'https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv'


@st.cache_data
def convert_df(input_df):
    """Return *input_df* serialised as UTF-8 encoded CSV bytes, without the index."""
    return input_df.to_csv(index=False).encode('utf-8')


@st.cache_data
def load_example_data():
    """Read the example CSV from ``EXAMPLE_DATA_URL``.

    Cached so that the file is not downloaded again on every script rerun
    (Streamlit re-executes the whole script on each widget interaction).
    """
    return pd.read_csv(EXAMPLE_DATA_URL)


with st.sidebar:
    # Load data
    st.header('1. Input data')

    st.markdown('**1.1. Use custom data**')
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file, index_col=False)

    # Download example data
    example_csv = load_example_data()
    csv = convert_df(example_csv)
    st.download_button(
        label="Download example CSV",
        data=csv,
        file_name='delaney_solubility_with_descriptors.csv',
        mime='text/csv',
    )

    # Select example data; when enabled it takes precedence over an uploaded file
    # because this assignment to df runs after the upload branch above.
    st.markdown('**1.2. Use example data**')
    example_data = st.toggle('Load example data')
    if example_data:
        df = load_example_data()

    st.header('2. Set Parameters')
    parameter_split_size = st.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)

    st.subheader('2.1. Learning Parameters')
    with st.expander('See parameters'):
        # Minimum is 100 rather than 0: a forest needs at least one tree, and
        # scikit-learn rejects n_estimators=0.
        parameter_n_estimators = st.slider('Number of estimators (n_estimators)', 100, 1000, 100, 100)
        parameter_max_features = st.select_slider('Max features (max_features)', options=['all', 'sqrt', 'log2'])
        parameter_min_samples_split = st.slider('Minimum number of samples required to split an internal node (min_samples_split)', 2, 10, 2, 1)
        parameter_min_samples_leaf = st.slider('Minimum number of samples required to be at a leaf node (min_samples_leaf)', 1, 10, 2, 1)

    st.subheader('2.2. General Parameters')
    with st.expander('See parameters', expanded=False):
        parameter_random_state = st.slider('Seed number (random_state)', 0, 1000, 42, 1)
        parameter_criterion = st.select_slider('Performance measure (criterion)', options=['squared_error', 'absolute_error', 'friedman_mse'])
        parameter_bootstrap = st.select_slider('Bootstrap samples when building trees (bootstrap)', options=[True, False])
        parameter_oob_score = st.select_slider('Whether to use out-of-bag samples to estimate the R^2 on unseen data (oob_score)', options=[False, True])

    # Artificial delay (seconds) inserted between the progress messages of the build.
    sleep_time = st.slider('Sleep time', 0, 3, 0)
# Initiate the model building process
# Runs only once a data source has been chosen in the sidebar; otherwise a hint is shown.
if uploaded_file or example_data:
    with st.status("Running ...", expanded=True) as status:

        st.write("Loading data ...")
        time.sleep(sleep_time)

        st.write("Preparing data ...")
        time.sleep(sleep_time)
        # Every column except the last is used as a feature; the last column is the target.
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        st.write("Splitting data ...")
        time.sleep(sleep_time)
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=(100 - parameter_split_size) / 100,
            random_state=parameter_random_state,
        )

        st.write("Model training ...")
        time.sleep(sleep_time)

        # 'all' is the UI name for scikit-learn's max_features=None.
        # The value shown in the "Model parameters" section is set in both
        # branches so it is always defined, whichever option is selected.
        if parameter_max_features == 'all':
            parameter_max_features = None
            parameter_max_features_metric = X.shape[1]
        else:
            parameter_max_features_metric = parameter_max_features

        # Out-of-bag scoring is only available with bootstrap sampling;
        # scikit-learn raises ValueError for oob_score=True with bootstrap=False.
        oob_score_dropped = parameter_oob_score and not parameter_bootstrap

        rf = RandomForestRegressor(
            n_estimators=parameter_n_estimators,
            max_features=parameter_max_features,
            min_samples_split=parameter_min_samples_split,
            min_samples_leaf=parameter_min_samples_leaf,
            random_state=parameter_random_state,
            criterion=parameter_criterion,
            bootstrap=parameter_bootstrap,
            oob_score=parameter_oob_score and parameter_bootstrap,
        )
        rf.fit(X_train, y_train)

        st.write("Applying model to make predictions ...")
        time.sleep(sleep_time)
        y_train_pred = rf.predict(X_train)
        y_test_pred = rf.predict(X_test)

        st.write("Evaluating performance metrics ...")
        time.sleep(sleep_time)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        st.write("Displaying performance metrics ...")
        time.sleep(sleep_time)
        # Built column-wise so the metric columns are numeric and round(3) takes
        # effect. The error columns are labelled MSE because that is the metric
        # computed above, independent of the split criterion used for training.
        rf_results = pd.DataFrame({
            'Method': ['Random forest'],
            'Training MSE': [train_mse],
            'Training R2': [train_r2],
            'Test MSE': [test_mse],
            'Test R2': [test_r2],
        }).round(3)

    status.update(label="Status", state="complete", expanded=False)

    if oob_score_dropped:
        st.warning('Out-of-bag scoring requires bootstrap sampling, so the model was trained with oob_score=False.')

    # Display data info
    st.header('Input data', divider='rainbow')
    col = st.columns(4)
    col[0].metric(label="No. of samples", value=X.shape[0], delta="")
    col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
    col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
    col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")

    with st.expander('Initial dataset', expanded=True):
        st.dataframe(df, height=210, use_container_width=True)
    with st.expander('Train split', expanded=False):
        train_col = st.columns((3, 1))
        with train_col[0]:
            st.markdown('**X**')
            st.dataframe(X_train, height=210, hide_index=True, use_container_width=True)
        with train_col[1]:
            st.markdown('**y**')
            st.dataframe(y_train, height=210, hide_index=True, use_container_width=True)
    with st.expander('Test split', expanded=False):
        test_col = st.columns((3, 1))
        with test_col[0]:
            st.markdown('**X**')
            st.dataframe(X_test, height=210, hide_index=True, use_container_width=True)
        with test_col[1]:
            st.markdown('**y**')
            st.dataframe(y_test, height=210, hide_index=True, use_container_width=True)

    # Zip dataset files
    # The archive is assembled in memory, so nothing is written to the working
    # directory and concurrent sessions cannot overwrite each other's files.
    dataset_files = {
        'dataset.csv': df,
        'X_train.csv': X_train,
        'y_train.csv': y_train,
        'X_test.csv': X_test,
        'y_test.csv': y_test,
    }
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', compression=zipfile.ZIP_DEFLATED) as zipF:
        for member_name, frame in dataset_files.items():
            zipF.writestr(member_name, frame.to_csv(index=False))

    btn = st.download_button(
        label='Download ZIP',
        data=zip_buffer.getvalue(),
        file_name="dataset.zip",
        mime="application/octet-stream"
    )

    # Display model parameters
    st.header('Model parameters', divider='rainbow')
    parameters_col = st.columns(3)
    parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
    parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
    parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")

    # Display feature importance plot
    importances = rf.feature_importances_
    feature_names = list(X.columns)
    forest_importances = pd.Series(importances, index=feature_names)
    df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})

    bars = alt.Chart(df_importance).mark_bar(size=40).encode(
        x='value:Q',
        y=alt.Y('feature:N', sort='-x')
    ).properties(height=250)

    # The narrow middle column only acts as a spacer between table and chart.
    performance_col = st.columns((2, 0.2, 3))
    with performance_col[0]:
        st.header('Model performance', divider='rainbow')
        st.dataframe(rf_results.T.reset_index().rename(columns={'index': 'Parameter', 0: 'Value'}))
    with performance_col[2]:
        st.header('Feature importance', divider='rainbow')
        st.altair_chart(bars, theme='streamlit', use_container_width=True)

    # Prediction results
    # Indices are reset so actual and predicted values line up positionally.
    st.header('Prediction results', divider='rainbow')
    s_y_train = pd.Series(y_train, name='actual').reset_index(drop=True)
    s_y_train_pred = pd.Series(y_train_pred, name='predicted').reset_index(drop=True)
    df_train = pd.DataFrame(data=[s_y_train, s_y_train_pred], index=None).T
    df_train['class'] = 'train'

    s_y_test = pd.Series(y_test, name='actual').reset_index(drop=True)
    s_y_test_pred = pd.Series(y_test_pred, name='predicted').reset_index(drop=True)
    df_test = pd.DataFrame(data=[s_y_test, s_y_test_pred], index=None).T
    df_test['class'] = 'test'

    df_prediction = pd.concat([df_train, df_test], axis=0)

    prediction_col = st.columns((2, 0.2, 3))

    # Display dataframe
    with prediction_col[0]:
        st.dataframe(df_prediction, height=320, use_container_width=True)

    # Display scatter plot of actual vs predicted values
    with prediction_col[2]:
        scatter = alt.Chart(df_prediction).mark_circle(size=60).encode(
            x='actual',
            y='predicted',
            color='class'
        )
        st.altair_chart(scatter, theme='streamlit', use_container_width=True)

# Ask for input if none is detected
else:
    # U+1F448 (hand pointing left, towards the sidebar), written as an escape
    # so that it survives any re-encoding of this source file.
    st.warning('\U0001F448 Upload a CSV file or click *"Load example data"* to get started!')
|