Add some test stuff

Files changed:
- app.py (+61 -21)
- requirements.txt (+3 -0)

app.py CHANGED
@@ -1,3 +1,5 @@
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from datasets import load_dataset
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -7,6 +9,7 @@ from sklearn.metrics import mean_squared_error, r2_score
 import altair as alt
 import time
 import zipfile
+import fitz
 
 # Page title
 st.set_page_config(page_title='ML Model Building', page_icon='🤖')
@@ -23,7 +26,7 @@ with st.expander('About this app'):
   st.markdown('Data sets:')
   st.code('''- Drug solubility data set
 ''', language='markdown')
-
+
   st.markdown('Libraries used:')
   st.code('''- Pandas for data wrangling
 - Scikit-learn for building a machine learning model
@@ -41,7 +44,7 @@ with st.sidebar:
     uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
     if uploaded_file is not None:
         df = pd.read_csv(uploaded_file, index_col=False)
-
+
     # Download example data
     @st.cache_data
     def convert_df(input_df):
@@ -81,9 +84,9 @@ with st.sidebar:
     sleep_time = st.slider('Sleep time', 0, 3, 0)
 
 # Initiate the model building process
-if uploaded_file or example_data:
+if uploaded_file or example_data:
     with st.status("Running ...", expanded=True) as status:
-
+
         st.write("Loading data ...")
         time.sleep(sleep_time)
 
@@ -91,18 +94,18 @@ if uploaded_file or example_data:
         time.sleep(sleep_time)
         X = df.iloc[:,:-1]
        y = df.iloc[:,-1]
-
+
         st.write("Splitting data ...")
         time.sleep(sleep_time)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(100-parameter_split_size)/100, random_state=parameter_random_state)
-
+
         st.write("Model training ...")
         time.sleep(sleep_time)
 
         if parameter_max_features == 'all':
             parameter_max_features = None
             parameter_max_features_metric = X.shape[1]
-
+
         rf = RandomForestRegressor(
             n_estimators=parameter_n_estimators,
             max_features=parameter_max_features,
@@ -113,19 +116,19 @@ if uploaded_file or example_data:
             bootstrap=parameter_bootstrap,
             oob_score=parameter_oob_score)
         rf.fit(X_train, y_train)
-
+
         st.write("Applying model to make predictions ...")
         time.sleep(sleep_time)
         y_train_pred = rf.predict(X_train)
         y_test_pred = rf.predict(X_test)
-
+
         st.write("Evaluating performance metrics ...")
         time.sleep(sleep_time)
         train_mse = mean_squared_error(y_train, y_train_pred)
         train_r2 = r2_score(y_train, y_train_pred)
         test_mse = mean_squared_error(y_test, y_test_pred)
         test_r2 = r2_score(y_test, y_test_pred)
-
+
         st.write("Displaying performance metrics ...")
         time.sleep(sleep_time)
         parameter_criterion_string = ' '.join([x.capitalize() for x in parameter_criterion.split('_')])
@@ -138,7 +141,7 @@ if uploaded_file or example_data:
             rf_results[col] = pd.to_numeric(rf_results[col], errors='ignore')
         # Round to 3 digits
         rf_results = rf_results.round(3)
-
+
         status.update(label="Status", state="complete", expanded=False)
 
     # Display data info
@@ -148,7 +151,7 @@ if uploaded_file or example_data:
    col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
    col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
    col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")
-
+
    with st.expander('Initial dataset', expanded=True):
        st.dataframe(df, height=210, use_container_width=True)
    with st.expander('Train split', expanded=False):
@@ -174,7 +177,7 @@ if uploaded_file or example_data:
    y_train.to_csv('y_train.csv', index=False)
    X_test.to_csv('X_test.csv', index=False)
    y_test.to_csv('y_test.csv', index=False)
-
+
    list_files = ['dataset.csv', 'X_train.csv', 'y_train.csv', 'X_test.csv', 'y_test.csv']
    with zipfile.ZipFile('dataset.zip', 'w') as zipF:
        for file in list_files:
@@ -187,20 +190,20 @@ if uploaded_file or example_data:
                file_name="dataset.zip",
                mime="application/octet-stream"
                )
-
+
    # Display model parameters
    st.header('Model parameters', divider='rainbow')
    parameters_col = st.columns(3)
    parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
    parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
    parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")
-
+
    # Display feature importance plot
    importances = rf.feature_importances_
    feature_names = list(X.columns)
    forest_importances = pd.Series(importances, index=feature_names)
    df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})
-
+
    bars = alt.Chart(df_importance).mark_bar(size=40).encode(
        x='value:Q',
        y=alt.Y('feature:N', sort='-x')
@@ -220,16 +223,16 @@ if uploaded_file or example_data:
    s_y_train_pred = pd.Series(y_train_pred, name='predicted').reset_index(drop=True)
    df_train = pd.DataFrame(data=[s_y_train, s_y_train_pred], index=None).T
    df_train['class'] = 'train'
-
+
    s_y_test = pd.Series(y_test, name='actual').reset_index(drop=True)
    s_y_test_pred = pd.Series(y_test_pred, name='predicted').reset_index(drop=True)
    df_test = pd.DataFrame(data=[s_y_test, s_y_test_pred], index=None).T
    df_test['class'] = 'test'
-
+
    df_prediction = pd.concat([df_train, df_test], axis=0)
-
+
    prediction_col = st.columns((2, 0.2, 3))
-
+
    # Display dataframe
    with prediction_col[0]:
        st.dataframe(df_prediction, height=320, use_container_width=True)
@@ -243,7 +246,44 @@ if uploaded_file or example_data:
        )
        st.altair_chart(scatter, theme='streamlit', use_container_width=True)
 
-
+
 # Ask for CSV upload if none is detected
 else:
     st.warning('👈 Upload a CSV file or click *"Load example data"* to get started!')
+
+
+@st.cache_data
+def predict():
+    tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
+    model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
+
+    dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+    example = dataset[0]
+    words = example["tokens"]
+    boxes = example["bboxes"]
+
+    encoding = tokenizer(words, boxes=boxes, return_tensors="pt")
+
+    outputs = model(**encoding)
+    predicted_class_idx = outputs.logits.argmax(-1).item()
+    predicted_class = model.config.id2label[predicted_class_idx]
+    return predicted_class
+
+@st.cache_data
+def get_page_count(file_name):
+    doc = fitz.open(stream=file_name.read())
+    return doc.page_count
+
+st.markdown('**1. Upload PDF**')
+doc_file = st.file_uploader("Upload a PDF document", type=["pdf"])
+
+st.markdown('**2. Select Page**')
+if doc_file is not None:
+    page_count = get_page_count(doc_file)
+    page_num = st.slider('Page number', 1, page_count, 1, 1)
+    st.markdown(f'Page: :green[{page_num}] / {page_count}')
+
+if st.button('Test', type="primary"):
+    with st.spinner('Loading...'):
+        prediction = predict()
+        st.write(prediction)
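Note on the new code in app.py: as committed, predict() never looks at the uploaded PDF. It loads the FUNSD sample set (nielsr/funsd-layoutlmv3) and classifies its first example, so the Test button returns the same label for every document and page; and since SCUT-DLVCLab/lilt-roberta-en-base ships without a trained classification head, that head is randomly initialized and the label is only meaningful as a smoke test. Separately, get_page_count() calls file_name.read(), which leaves the UploadedFile buffer at EOF for any later reader; getvalue() sidesteps that. Below is a minimal sketch of wiring the selected page into the model instead, assuming PyMuPDF's get_text("words") extraction and the 0-1000 box grid that LayoutLM-style tokenizers expect; the helper name predict_page is hypothetical and not part of the commit.

# Hypothetical replacement for predict(); a sketch, not part of the commit.
@st.cache_data
def predict_page(pdf_bytes, page_num):
    tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
    model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")

    # An explicit filetype spares PyMuPDF from guessing the stream format.
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    page = doc[page_num - 1]

    words, boxes = [], []
    # get_text("words") yields (x0, y0, x1, y1, word, ...) tuples.
    for x0, y0, x1, y1, word, *rest in page.get_text("words"):
        words.append(word)
        # LayoutLM-style models expect boxes normalized to a 0-1000 grid.
        boxes.append([int(1000 * x0 / page.rect.width),
                      int(1000 * y0 / page.rect.height),
                      int(1000 * x1 / page.rect.width),
                      int(1000 * y1 / page.rect.height)])

    encoding = tokenizer(words, boxes=boxes, truncation=True, return_tensors="pt")
    outputs = model(**encoding)
    predicted_class_idx = outputs.logits.argmax(-1).item()
    return model.config.id2label[predicted_class_idx]

# Usage inside the button handler, e.g.:
#     prediction = predict_page(doc_file.getvalue(), page_num)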
requirements.txt CHANGED

@@ -1,4 +1,7 @@
 streamlit==1.29.0
+PyMuPDF==1.23.8
 pandas>=1.3.0
 scikit-learn
 altair>=4.0
+transformers[torch]
+datasets
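The three added requirements line up with the new imports: PyMuPDF provides the fitz module, transformers[torch] pulls in PyTorch as the backend for the LiLT checkpoint, and datasets supplies load_dataset for the FUNSD sample. A hypothetical import smoke test, assuming nothing beyond the pinned packages:

# Hypothetical check that the new dependencies resolve together; not part of the commit.
import fitz            # installed by PyMuPDF
import datasets
import torch           # pulled in via transformers[torch]
import transformers

print(fitz.__doc__)    # PyMuPDF embeds its version in the module docstring
print(transformers.__version__, datasets.__version__, torch.__version__)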