Tedvalson committed on
Commit
6ac343d
·
1 Parent(s): d071810

Add some test stuff

Browse files
Files changed (2) hide show
  1. app.py +61 -21
  2. requirements.txt +3 -0
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
@@ -7,6 +9,7 @@ from sklearn.metrics import mean_squared_error, r2_score
7
  import altair as alt
8
  import time
9
  import zipfile
 
10
 
11
  # Page title
12
  st.set_page_config(page_title='ML Model Building', page_icon='🤖')
@@ -23,7 +26,7 @@ with st.expander('About this app'):
23
  st.markdown('Data sets:')
24
  st.code('''- Drug solubility data set
25
  ''', language='markdown')
26
-
27
  st.markdown('Libraries used:')
28
  st.code('''- Pandas for data wrangling
29
  - Scikit-learn for building a machine learning model
@@ -41,7 +44,7 @@ with st.sidebar:
41
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
42
  if uploaded_file is not None:
43
  df = pd.read_csv(uploaded_file, index_col=False)
44
-
45
  # Download example data
46
  @st.cache_data
47
  def convert_df(input_df):
@@ -81,9 +84,9 @@ with st.sidebar:
81
  sleep_time = st.slider('Sleep time', 0, 3, 0)
82
 
83
  # Initiate the model building process
84
- if uploaded_file or example_data:
85
  with st.status("Running ...", expanded=True) as status:
86
-
87
  st.write("Loading data ...")
88
  time.sleep(sleep_time)
89
 
@@ -91,18 +94,18 @@ if uploaded_file or example_data:
91
  time.sleep(sleep_time)
92
  X = df.iloc[:,:-1]
93
  y = df.iloc[:,-1]
94
-
95
  st.write("Splitting data ...")
96
  time.sleep(sleep_time)
97
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(100-parameter_split_size)/100, random_state=parameter_random_state)
98
-
99
  st.write("Model training ...")
100
  time.sleep(sleep_time)
101
 
102
  if parameter_max_features == 'all':
103
  parameter_max_features = None
104
  parameter_max_features_metric = X.shape[1]
105
-
106
  rf = RandomForestRegressor(
107
  n_estimators=parameter_n_estimators,
108
  max_features=parameter_max_features,
@@ -113,19 +116,19 @@ if uploaded_file or example_data:
113
  bootstrap=parameter_bootstrap,
114
  oob_score=parameter_oob_score)
115
  rf.fit(X_train, y_train)
116
-
117
  st.write("Applying model to make predictions ...")
118
  time.sleep(sleep_time)
119
  y_train_pred = rf.predict(X_train)
120
  y_test_pred = rf.predict(X_test)
121
-
122
  st.write("Evaluating performance metrics ...")
123
  time.sleep(sleep_time)
124
  train_mse = mean_squared_error(y_train, y_train_pred)
125
  train_r2 = r2_score(y_train, y_train_pred)
126
  test_mse = mean_squared_error(y_test, y_test_pred)
127
  test_r2 = r2_score(y_test, y_test_pred)
128
-
129
  st.write("Displaying performance metrics ...")
130
  time.sleep(sleep_time)
131
  parameter_criterion_string = ' '.join([x.capitalize() for x in parameter_criterion.split('_')])
@@ -138,7 +141,7 @@ if uploaded_file or example_data:
138
  rf_results[col] = pd.to_numeric(rf_results[col], errors='ignore')
139
  # Round to 3 digits
140
  rf_results = rf_results.round(3)
141
-
142
  status.update(label="Status", state="complete", expanded=False)
143
 
144
  # Display data info
@@ -148,7 +151,7 @@ if uploaded_file or example_data:
148
  col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
149
  col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
150
  col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")
151
-
152
  with st.expander('Initial dataset', expanded=True):
153
  st.dataframe(df, height=210, use_container_width=True)
154
  with st.expander('Train split', expanded=False):
@@ -174,7 +177,7 @@ if uploaded_file or example_data:
174
  y_train.to_csv('y_train.csv', index=False)
175
  X_test.to_csv('X_test.csv', index=False)
176
  y_test.to_csv('y_test.csv', index=False)
177
-
178
  list_files = ['dataset.csv', 'X_train.csv', 'y_train.csv', 'X_test.csv', 'y_test.csv']
179
  with zipfile.ZipFile('dataset.zip', 'w') as zipF:
180
  for file in list_files:
@@ -187,20 +190,20 @@ if uploaded_file or example_data:
187
  file_name="dataset.zip",
188
  mime="application/octet-stream"
189
  )
190
-
191
  # Display model parameters
192
  st.header('Model parameters', divider='rainbow')
193
  parameters_col = st.columns(3)
194
  parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
195
  parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
196
  parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")
197
-
198
  # Display feature importance plot
199
  importances = rf.feature_importances_
200
  feature_names = list(X.columns)
201
  forest_importances = pd.Series(importances, index=feature_names)
202
  df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})
203
-
204
  bars = alt.Chart(df_importance).mark_bar(size=40).encode(
205
  x='value:Q',
206
  y=alt.Y('feature:N', sort='-x')
@@ -220,16 +223,16 @@ if uploaded_file or example_data:
220
  s_y_train_pred = pd.Series(y_train_pred, name='predicted').reset_index(drop=True)
221
  df_train = pd.DataFrame(data=[s_y_train, s_y_train_pred], index=None).T
222
  df_train['class'] = 'train'
223
-
224
  s_y_test = pd.Series(y_test, name='actual').reset_index(drop=True)
225
  s_y_test_pred = pd.Series(y_test_pred, name='predicted').reset_index(drop=True)
226
  df_test = pd.DataFrame(data=[s_y_test, s_y_test_pred], index=None).T
227
  df_test['class'] = 'test'
228
-
229
  df_prediction = pd.concat([df_train, df_test], axis=0)
230
-
231
  prediction_col = st.columns((2, 0.2, 3))
232
-
233
  # Display dataframe
234
  with prediction_col[0]:
235
  st.dataframe(df_prediction, height=320, use_container_width=True)
@@ -243,7 +246,44 @@ if uploaded_file or example_data:
243
  )
244
  st.altair_chart(scatter, theme='streamlit', use_container_width=True)
245
 
246
-
247
  # Ask for CSV upload if none is detected
248
  else:
249
  st.warning('👈 Upload a CSV file or click *"Load example data"* to get started!')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+ from datasets import load_dataset
3
  import streamlit as st
4
  import pandas as pd
5
  import numpy as np
 
9
  import altair as alt
10
  import time
11
  import zipfile
12
+ import fitz
13
 
14
  # Page title
15
  st.set_page_config(page_title='ML Model Building', page_icon='🤖')
 
26
  st.markdown('Data sets:')
27
  st.code('''- Drug solubility data set
28
  ''', language='markdown')
29
+
30
  st.markdown('Libraries used:')
31
  st.code('''- Pandas for data wrangling
32
  - Scikit-learn for building a machine learning model
 
44
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
45
  if uploaded_file is not None:
46
  df = pd.read_csv(uploaded_file, index_col=False)
47
+
48
  # Download example data
49
  @st.cache_data
50
  def convert_df(input_df):
 
84
  sleep_time = st.slider('Sleep time', 0, 3, 0)
85
 
86
  # Initiate the model building process
87
+ if uploaded_file or example_data:
88
  with st.status("Running ...", expanded=True) as status:
89
+
90
  st.write("Loading data ...")
91
  time.sleep(sleep_time)
92
 
 
94
  time.sleep(sleep_time)
95
  X = df.iloc[:,:-1]
96
  y = df.iloc[:,-1]
97
+
98
  st.write("Splitting data ...")
99
  time.sleep(sleep_time)
100
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(100-parameter_split_size)/100, random_state=parameter_random_state)
101
+
102
  st.write("Model training ...")
103
  time.sleep(sleep_time)
104
 
105
  if parameter_max_features == 'all':
106
  parameter_max_features = None
107
  parameter_max_features_metric = X.shape[1]
108
+
109
  rf = RandomForestRegressor(
110
  n_estimators=parameter_n_estimators,
111
  max_features=parameter_max_features,
 
116
  bootstrap=parameter_bootstrap,
117
  oob_score=parameter_oob_score)
118
  rf.fit(X_train, y_train)
119
+
120
  st.write("Applying model to make predictions ...")
121
  time.sleep(sleep_time)
122
  y_train_pred = rf.predict(X_train)
123
  y_test_pred = rf.predict(X_test)
124
+
125
  st.write("Evaluating performance metrics ...")
126
  time.sleep(sleep_time)
127
  train_mse = mean_squared_error(y_train, y_train_pred)
128
  train_r2 = r2_score(y_train, y_train_pred)
129
  test_mse = mean_squared_error(y_test, y_test_pred)
130
  test_r2 = r2_score(y_test, y_test_pred)
131
+
132
  st.write("Displaying performance metrics ...")
133
  time.sleep(sleep_time)
134
  parameter_criterion_string = ' '.join([x.capitalize() for x in parameter_criterion.split('_')])
 
141
  rf_results[col] = pd.to_numeric(rf_results[col], errors='ignore')
142
  # Round to 3 digits
143
  rf_results = rf_results.round(3)
144
+
145
  status.update(label="Status", state="complete", expanded=False)
146
 
147
  # Display data info
 
151
  col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
152
  col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
153
  col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")
154
+
155
  with st.expander('Initial dataset', expanded=True):
156
  st.dataframe(df, height=210, use_container_width=True)
157
  with st.expander('Train split', expanded=False):
 
177
  y_train.to_csv('y_train.csv', index=False)
178
  X_test.to_csv('X_test.csv', index=False)
179
  y_test.to_csv('y_test.csv', index=False)
180
+
181
  list_files = ['dataset.csv', 'X_train.csv', 'y_train.csv', 'X_test.csv', 'y_test.csv']
182
  with zipfile.ZipFile('dataset.zip', 'w') as zipF:
183
  for file in list_files:
 
190
  file_name="dataset.zip",
191
  mime="application/octet-stream"
192
  )
193
+
194
  # Display model parameters
195
  st.header('Model parameters', divider='rainbow')
196
  parameters_col = st.columns(3)
197
  parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
198
  parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
199
  parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")
200
+
201
  # Display feature importance plot
202
  importances = rf.feature_importances_
203
  feature_names = list(X.columns)
204
  forest_importances = pd.Series(importances, index=feature_names)
205
  df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})
206
+
207
  bars = alt.Chart(df_importance).mark_bar(size=40).encode(
208
  x='value:Q',
209
  y=alt.Y('feature:N', sort='-x')
 
223
  s_y_train_pred = pd.Series(y_train_pred, name='predicted').reset_index(drop=True)
224
  df_train = pd.DataFrame(data=[s_y_train, s_y_train_pred], index=None).T
225
  df_train['class'] = 'train'
226
+
227
  s_y_test = pd.Series(y_test, name='actual').reset_index(drop=True)
228
  s_y_test_pred = pd.Series(y_test_pred, name='predicted').reset_index(drop=True)
229
  df_test = pd.DataFrame(data=[s_y_test, s_y_test_pred], index=None).T
230
  df_test['class'] = 'test'
231
+
232
  df_prediction = pd.concat([df_train, df_test], axis=0)
233
+
234
  prediction_col = st.columns((2, 0.2, 3))
235
+
236
  # Display dataframe
237
  with prediction_col[0]:
238
  st.dataframe(df_prediction, height=320, use_container_width=True)
 
246
  )
247
  st.altair_chart(scatter, theme='streamlit', use_container_width=True)
248
 
249
+
250
  # Ask for CSV upload if none is detected
251
  else:
252
  st.warning('👈 Upload a CSV file or click *"Load example data"* to get started!')
253
+
254
+
255
+ @st.cache_data
256
+ def predict():
257
+ tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
258
+ model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
259
+
260
+ dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
261
+ example = dataset[0]
262
+ words = example["tokens"]
263
+ boxes = example["bboxes"]
264
+
265
+ encoding = tokenizer(words, boxes=boxes, return_tensors="pt")
266
+
267
+ outputs = model(**encoding)
268
+ predicted_class_idx = outputs.logits.argmax(-1).item()
269
+ predicted_class = model.config.id2label[predicted_class_idx]
270
+ return predicted_class
271
+
272
+ @st.cache_data
273
+ def get_page_count(file_name):
274
+ doc = fitz.open(stream=file_name.read())
275
+ return doc.page_count
276
+
277
+ st.markdown('**1. Upload PDF**')
278
+ doc_file = st.file_uploader("Upload a PDF document", type=["pdf"])
279
+
280
+ st.markdown('**2. Select Page**')
281
+ if doc_file is not None:
282
+ page_count = get_page_count(doc_file)
283
+ page_num = st.slider('Page number', 1, page_count, 1, 1)
284
+ st.markdown(f'Page: :green[{page_num}] / {page_count}')
285
+
286
+ if st.button('Test', type="primary"):
287
+ with st.spinner('Loading...'):
288
+ prediction = predict()
289
+ st.write(prediction)
requirements.txt CHANGED
@@ -1,4 +1,7 @@
1
  streamlit==1.29.0
 
2
  pandas>=1.3.0
3
  scikit-learn
4
  altair>=4.0
 
 
 
1
  streamlit==1.29.0
2
+ PyMuPDF==1.23.8
3
  pandas>=1.3.0
4
  scikit-learn
5
  altair>=4.0
6
+ transformers[torch]
7
+ datasets