File size: 12,405 Bytes
6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d d071810 6ac343d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 |
import io
import time
import zipfile

import altair as alt
import fitz
import numpy as np
import pandas as pd
import streamlit as st
from datasets import load_dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Page title
# The icon/title emoji had been mangled by a bad re-encoding; a two-character
# non-emoji string is not a valid page icon, so the robot emoji is restored.
st.set_page_config(page_title='ML Model Building', page_icon='🤖')
st.title('🤖 ML Model Building')

# Collapsible "About" panel describing the app, its usage and its dependencies.
with st.expander('About this app'):
    st.markdown('**What can this app do?**')
    st.info('This app allow users to build a machine learning (ML) model in an end-to-end workflow. Particularly, this encompasses data upload, data pre-processing, ML model building and post-model analysis.')

    st.markdown('**How to use the app?**')
    st.warning('To engage with the app, go to the sidebar and 1. Select a data set and 2. Adjust the model parameters by adjusting the various slider widgets. As a result, this would initiate the ML model building process, display the model results as well as allowing users to download the generated models and accompanying data.')

    st.markdown('**Under the hood**')
    st.markdown('Data sets:')
    st.code('- Drug solubility data set\n', language='markdown')

    st.markdown('Libraries used:')
    st.code(
        '- Pandas for data wrangling\n'
        '- Scikit-learn for building a machine learning model\n'
        '- Altair for chart creation\n'
        '- Streamlit for user interface\n',
        language='markdown',
    )
# Sidebar for accepting input parameters
with st.sidebar:
    # Load data
    st.header('1.1. Input data')

    st.markdown('**1. Use custom data**')
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file, index_col=False)

    # Download example data
    @st.cache_data
    def convert_df(input_df):
        """Return ``input_df`` serialised as UTF-8 encoded CSV bytes (no index column)."""
        return input_df.to_csv(index=False).encode('utf-8')

    @st.cache_data
    def load_example_data():
        """Fetch the example solubility data set once and reuse it across reruns.

        Streamlit re-executes the whole script on every widget interaction;
        without caching the CSV was downloaded up to twice per rerun.
        """
        return pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')

    example_csv = load_example_data()
    csv = convert_df(example_csv)
    st.download_button(
        label="Download example CSV",
        data=csv,
        file_name='delaney_solubility_with_descriptors.csv',
        mime='text/csv',
    )

    # Select example data
    st.markdown('**1.2. Use example data**')
    example_data = st.toggle('Load example data')
    if example_data:
        # When both a file and the toggle are set, the example data wins
        # (same precedence as before: this assignment runs last).
        df = load_example_data()

    st.header('2. Set Parameters')
    parameter_split_size = st.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)

    st.subheader('2.1. Learning Parameters')
    with st.expander('See parameters'):
        # Lower bound is 100 rather than 0: a forest needs at least one tree,
        # and n_estimators=0 is rejected by scikit-learn.
        parameter_n_estimators = st.slider('Number of estimators (n_estimators)', 100, 1000, 100, 100)
        parameter_max_features = st.select_slider('Max features (max_features)', options=['all', 'sqrt', 'log2'])
        parameter_min_samples_split = st.slider('Minimum number of samples required to split an internal node (min_samples_split)', 2, 10, 2, 1)
        parameter_min_samples_leaf = st.slider('Minimum number of samples required to be at a leaf node (min_samples_leaf)', 1, 10, 2, 1)

    st.subheader('2.2. General Parameters')
    with st.expander('See parameters', expanded=False):
        parameter_random_state = st.slider('Seed number (random_state)', 0, 1000, 42, 1)
        parameter_criterion = st.select_slider('Performance measure (criterion)', options=['squared_error', 'absolute_error', 'friedman_mse'])
        parameter_bootstrap = st.select_slider('Bootstrap samples when building trees (bootstrap)', options=[True, False])
        parameter_oob_score = st.select_slider('Whether to use out-of-bag samples to estimate the R^2 on unseen data (oob_score)', options=[False, True])

    # Artificial delay (seconds) inserted between the pipeline status messages.
    sleep_time = st.slider('Sleep time', 0, 3, 0)
# Initiate the model building process
if uploaded_file or example_data:
    with st.status("Running ...", expanded=True) as status:

        st.write("Loading data ...")
        time.sleep(sleep_time)

        st.write("Preparing data ...")
        time.sleep(sleep_time)
        # Convention used by this app: last column is the target, the rest are features.
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        st.write("Splitting data ...")
        time.sleep(sleep_time)
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=(100 - parameter_split_size) / 100,
            random_state=parameter_random_state,
        )

        st.write("Model training ...")
        time.sleep(sleep_time)

        # 'all' maps to max_features=None (use every feature). The value shown
        # in the "Model parameters" section must be defined for every choice,
        # otherwise 'sqrt' / 'log2' hit a NameError further down.
        if parameter_max_features == 'all':
            parameter_max_features = None
            parameter_max_features_metric = X.shape[1]
        else:
            parameter_max_features_metric = parameter_max_features

        # Out-of-bag scoring needs bootstrap sampling; scikit-learn raises a
        # ValueError for oob_score=True with bootstrap=False, so disable it
        # and tell the user instead of crashing.
        use_oob_score = parameter_oob_score and parameter_bootstrap
        if parameter_oob_score and not parameter_bootstrap:
            st.warning('Out-of-bag scoring requires bootstrap sampling; oob_score was disabled for this run.')

        rf = RandomForestRegressor(
            n_estimators=parameter_n_estimators,
            max_features=parameter_max_features,
            min_samples_split=parameter_min_samples_split,
            min_samples_leaf=parameter_min_samples_leaf,
            random_state=parameter_random_state,
            criterion=parameter_criterion,
            bootstrap=parameter_bootstrap,
            oob_score=use_oob_score)
        rf.fit(X_train, y_train)

        st.write("Applying model to make predictions ...")
        time.sleep(sleep_time)
        y_train_pred = rf.predict(X_train)
        y_test_pred = rf.predict(X_test)

        st.write("Evaluating performance metrics ...")
        time.sleep(sleep_time)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        st.write("Displaying performance metrics ...")
        time.sleep(sleep_time)
        # e.g. 'squared_error' -> 'Squared Error'
        # NOTE(review): the column label follows the selected split criterion,
        # but the reported value is always the mean squared error.
        parameter_criterion_string = ' '.join([x.capitalize() for x in parameter_criterion.split('_')])
        # Build the one-row results table with native dtypes so no
        # object->numeric conversion (deprecated errors='ignore') is needed.
        rf_results = pd.DataFrame([{
            'Method': 'Random forest',
            f'Training {parameter_criterion_string}': train_mse,
            'Training R2': train_r2,
            f'Test {parameter_criterion_string}': test_mse,
            'Test R2': test_r2,
        }])
        # Round to 3 digits
        rf_results = rf_results.round(3)

    status.update(label="Status", state="complete", expanded=False)

    # Display data info
    st.header('Input data', divider='rainbow')
    col = st.columns(4)
    col[0].metric(label="No. of samples", value=X.shape[0], delta="")
    col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
    col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
    col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")

    with st.expander('Initial dataset', expanded=True):
        st.dataframe(df, height=210, use_container_width=True)
    with st.expander('Train split', expanded=False):
        train_col = st.columns((3, 1))
        with train_col[0]:
            st.markdown('**X**')
            st.dataframe(X_train, height=210, hide_index=True, use_container_width=True)
        with train_col[1]:
            st.markdown('**y**')
            st.dataframe(y_train, height=210, hide_index=True, use_container_width=True)
    with st.expander('Test split', expanded=False):
        test_col = st.columns((3, 1))
        with test_col[0]:
            st.markdown('**X**')
            st.dataframe(X_test, height=210, hide_index=True, use_container_width=True)
        with test_col[1]:
            st.markdown('**y**')
            st.dataframe(y_test, height=210, hide_index=True, use_container_width=True)

    # Zip dataset files
    # The archive is assembled in memory: writing fixed file names into the
    # working directory would be shared (and clobbered) between concurrent
    # sessions and fails on read-only deployments.
    dataset_files = {
        'dataset.csv': df,
        'X_train.csv': X_train,
        'y_train.csv': y_train,
        'X_test.csv': X_test,
        'y_test.csv': y_test,
    }
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', compression=zipfile.ZIP_DEFLATED) as zipF:
        for file_name, frame in dataset_files.items():
            zipF.writestr(file_name, frame.to_csv(index=False))

    btn = st.download_button(
        label='Download ZIP',
        data=zip_buffer.getvalue(),
        file_name="dataset.zip",
        mime="application/octet-stream"
    )

    # Display model parameters
    st.header('Model parameters', divider='rainbow')
    parameters_col = st.columns(3)
    parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
    parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
    parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")

    # Display feature importance plot
    importances = rf.feature_importances_
    feature_names = list(X.columns)
    forest_importances = pd.Series(importances, index=feature_names)
    df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})

    bars = alt.Chart(df_importance).mark_bar(size=40).encode(
        x='value:Q',
        y=alt.Y('feature:N', sort='-x')
    ).properties(height=250)

    performance_col = st.columns((2, 0.2, 3))
    with performance_col[0]:
        st.header('Model performance', divider='rainbow')
        st.dataframe(rf_results.T.reset_index().rename(columns={'index': 'Parameter', 0: 'Value'}))
    with performance_col[2]:
        st.header('Feature importance', divider='rainbow')
        st.altair_chart(bars, theme='streamlit', use_container_width=True)

    # Prediction results
    st.header('Prediction results', divider='rainbow')
    s_y_train = pd.Series(y_train, name='actual').reset_index(drop=True)
    s_y_train_pred = pd.Series(y_train_pred, name='predicted').reset_index(drop=True)
    df_train = pd.DataFrame(data=[s_y_train, s_y_train_pred], index=None).T
    df_train['class'] = 'train'

    s_y_test = pd.Series(y_test, name='actual').reset_index(drop=True)
    s_y_test_pred = pd.Series(y_test_pred, name='predicted').reset_index(drop=True)
    df_test = pd.DataFrame(data=[s_y_test, s_y_test_pred], index=None).T
    df_test['class'] = 'test'

    df_prediction = pd.concat([df_train, df_test], axis=0)

    prediction_col = st.columns((2, 0.2, 3))

    # Display dataframe
    with prediction_col[0]:
        st.dataframe(df_prediction, height=320, use_container_width=True)

    # Display scatter plot of actual vs predicted values
    with prediction_col[2]:
        scatter = alt.Chart(df_prediction).mark_circle(size=60).encode(
            x='actual',
            y='predicted',
            color='class'
        )
        st.altair_chart(scatter, theme='streamlit', use_container_width=True)

# Ask for CSV upload if none is detected
else:
    st.warning('👆 Upload a CSV file or click *"Load example data"* to get started!')
@st.cache_data
def predict():
    """Classify the first FUNSD training example with the LiLT checkpoint.

    Loads the tokenizer and sequence-classification model for
    ``SCUT-DLVCLab/lilt-roberta-en-base``, encodes the words and bounding
    boxes of example 0 of ``nielsr/funsd-layoutlmv3`` (train split) and
    returns the label name of the highest-scoring class.

    NOTE(review): the checkpoint name suggests a base (not fine-tuned) model,
    in which case the classification head is freshly initialised and the
    returned label carries no meaning -- confirm the intended checkpoint.

    Returns:
        The predicted label as listed in ``model.config.id2label``.
    """
    # Imported lazily: torch is only needed here and is heavy to import.
    import torch

    checkpoint = "SCUT-DLVCLab/lilt-roberta-en-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
    example = dataset[0]
    words = example["tokens"]
    boxes = example["bboxes"]

    # Truncate to the 512-token window so a long document cannot overflow the
    # model's position embeddings.
    encoding = tokenizer(
        words,
        boxes=boxes,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**encoding)

    predicted_class_idx = outputs.logits.argmax(-1).item()
    predicted_class = model.config.id2label[predicted_class_idx]
    return predicted_class
@st.cache_data
def get_page_count(file_name):
    """Return the number of pages in an uploaded PDF.

    Args:
        file_name: A binary file-like object holding the PDF (despite the
            parameter name, this is the uploaded file object, not a path).

    Returns:
        The page count reported by PyMuPDF.
    """
    # Prefer getvalue(): unlike read(), it does not move the stream position,
    # so the same upload can still be read by later code.
    if hasattr(file_name, 'getvalue'):
        pdf_bytes = file_name.getvalue()
    else:
        pdf_bytes = file_name.read()

    # The context manager closes the document handle once the count is read.
    with fitz.open(stream=pdf_bytes, filetype='pdf') as doc:
        return doc.page_count
# PDF upload and page selection.
st.markdown('**1. Upload PDF**')
doc_file = st.file_uploader("Upload a PDF document", type=["pdf"])

st.markdown('**2. Select Page**')
if doc_file is not None:
    page_count = get_page_count(doc_file)
    # A slider needs min < max, so a single-page document skips the widget
    # and simply selects page 1.
    if page_count > 1:
        page_num = st.slider('Page number', 1, page_count, 1, 1)
    else:
        page_num = 1
    st.markdown(f'Page: :green[{page_num}] / {page_count}')

# NOTE(review): the original indentation was lost; the button is placed at top
# level because predict() does not depend on the uploaded document -- confirm.
if st.button('Test', type="primary"):
    with st.spinner('Loading...'):
        prediction = predict()
    st.write(prediction)
|