Add some test stuff

Files changed:
- app.py (+61 -21)
- requirements.txt (+3 -0)

app.py CHANGED
@@ -1,3 +1,5 @@
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from datasets import load_dataset
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -7,6 +9,7 @@ from sklearn.metrics import mean_squared_error, r2_score
 import altair as alt
 import time
 import zipfile
+import fitz
 
 # Page title
 st.set_page_config(page_title='ML Model Building', page_icon='🤖')
@@ -23,7 +26,7 @@ with st.expander('About this app'):
   st.markdown('Data sets:')
   st.code('''- Drug solubility data set
 ''', language='markdown')
-
+
   st.markdown('Libraries used:')
   st.code('''- Pandas for data wrangling
 - Scikit-learn for building a machine learning model
@@ -41,7 +44,7 @@ with st.sidebar:
     uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
     if uploaded_file is not None:
         df = pd.read_csv(uploaded_file, index_col=False)
-
+
     # Download example data
     @st.cache_data
     def convert_df(input_df):
@@ -81,9 +84,9 @@ with st.sidebar:
     sleep_time = st.slider('Sleep time', 0, 3, 0)
 
 # Initiate the model building process
-if uploaded_file or example_data:
+if uploaded_file or example_data:
     with st.status("Running ...", expanded=True) as status:
-
+
         st.write("Loading data ...")
         time.sleep(sleep_time)
 
@@ -91,18 +94,18 @@ if uploaded_file or example_data:
         time.sleep(sleep_time)
         X = df.iloc[:,:-1]
        y = df.iloc[:,-1]
-
+
         st.write("Splitting data ...")
         time.sleep(sleep_time)
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(100-parameter_split_size)/100, random_state=parameter_random_state)
-
+
         st.write("Model training ...")
         time.sleep(sleep_time)
 
         if parameter_max_features == 'all':
             parameter_max_features = None
             parameter_max_features_metric = X.shape[1]
-
+
         rf = RandomForestRegressor(
             n_estimators=parameter_n_estimators,
             max_features=parameter_max_features,
@@ -113,19 +116,19 @@ if uploaded_file or example_data:
             bootstrap=parameter_bootstrap,
             oob_score=parameter_oob_score)
         rf.fit(X_train, y_train)
-
+
         st.write("Applying model to make predictions ...")
         time.sleep(sleep_time)
         y_train_pred = rf.predict(X_train)
         y_test_pred = rf.predict(X_test)
-
+
         st.write("Evaluating performance metrics ...")
         time.sleep(sleep_time)
         train_mse = mean_squared_error(y_train, y_train_pred)
         train_r2 = r2_score(y_train, y_train_pred)
         test_mse = mean_squared_error(y_test, y_test_pred)
         test_r2 = r2_score(y_test, y_test_pred)
-
+
         st.write("Displaying performance metrics ...")
         time.sleep(sleep_time)
         parameter_criterion_string = ' '.join([x.capitalize() for x in parameter_criterion.split('_')])
@@ -138,7 +141,7 @@ if uploaded_file or example_data:
             rf_results[col] = pd.to_numeric(rf_results[col], errors='ignore')
         # Round to 3 digits
         rf_results = rf_results.round(3)
-
+
         status.update(label="Status", state="complete", expanded=False)
 
     # Display data info
@@ -148,7 +151,7 @@ if uploaded_file or example_data:
    col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
    col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
    col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")
-
+
    with st.expander('Initial dataset', expanded=True):
        st.dataframe(df, height=210, use_container_width=True)
    with st.expander('Train split', expanded=False):
@@ -174,7 +177,7 @@ if uploaded_file or example_data:
    y_train.to_csv('y_train.csv', index=False)
    X_test.to_csv('X_test.csv', index=False)
    y_test.to_csv('y_test.csv', index=False)
-
+
    list_files = ['dataset.csv', 'X_train.csv', 'y_train.csv', 'X_test.csv', 'y_test.csv']
    with zipfile.ZipFile('dataset.zip', 'w') as zipF:
        for file in list_files:
@@ -187,20 +190,20 @@ if uploaded_file or example_data:
                file_name="dataset.zip",
                mime="application/octet-stream"
                )
-
+
    # Display model parameters
    st.header('Model parameters', divider='rainbow')
    parameters_col = st.columns(3)
    parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
    parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
    parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")
-
+
    # Display feature importance plot
    importances = rf.feature_importances_
    feature_names = list(X.columns)
    forest_importances = pd.Series(importances, index=feature_names)
    df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})
-
+
    bars = alt.Chart(df_importance).mark_bar(size=40).encode(
        x='value:Q',
        y=alt.Y('feature:N', sort='-x')
@@ -220,16 +223,16 @@ if uploaded_file or example_data:
    s_y_train_pred = pd.Series(y_train_pred, name='predicted').reset_index(drop=True)
    df_train = pd.DataFrame(data=[s_y_train, s_y_train_pred], index=None).T
    df_train['class'] = 'train'
-
+
    s_y_test = pd.Series(y_test, name='actual').reset_index(drop=True)
    s_y_test_pred = pd.Series(y_test_pred, name='predicted').reset_index(drop=True)
    df_test = pd.DataFrame(data=[s_y_test, s_y_test_pred], index=None).T
    df_test['class'] = 'test'
-
+
    df_prediction = pd.concat([df_train, df_test], axis=0)
-
+
    prediction_col = st.columns((2, 0.2, 3))
-
+
    # Display dataframe
    with prediction_col[0]:
        st.dataframe(df_prediction, height=320, use_container_width=True)
@@ -243,7 +246,44 @@ if uploaded_file or example_data:
        )
        st.altair_chart(scatter, theme='streamlit', use_container_width=True)
 
-
+
 # Ask for CSV upload if none is detected
 else:
     st.warning('👈 Upload a CSV file or click *"Load example data"* to get started!')
+
+
+@st.cache_data
+def predict():
+    tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
+    model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
+
+    dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
+    example = dataset[0]
+    words = example["tokens"]
+    boxes = example["bboxes"]
+
+    encoding = tokenizer(words, boxes=boxes, return_tensors="pt")
+
+    outputs = model(**encoding)
+    predicted_class_idx = outputs.logits.argmax(-1).item()
+    predicted_class = model.config.id2label[predicted_class_idx]
+    return predicted_class
+
+@st.cache_data
+def get_page_count(file_name):
+    doc = fitz.open(stream=file_name.read())
+    return doc.page_count
+
+st.markdown('**1. Upload PDF**')
+doc_file = st.file_uploader("Upload a PDF document", type=["pdf"])
+
+st.markdown('**2. Select Page**')
+if doc_file is not None:
+    page_count = get_page_count(doc_file)
+    page_num = st.slider('Page number', 1, page_count, 1, 1)
+    st.markdown(f'Page: :green[{page_num}] / {page_count}')
+
+if st.button('Test', type="primary"):
+    with st.spinner('Loading...'):
+        prediction = predict()
+        st.write(prediction)
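Note on the new code in app.py: as committed, predict() never looks at the uploaded PDF. It loads the FUNSD sample set (nielsr/funsd-layoutlmv3) and classifies its first example, so the Test button returns the same label for every document and page; and since SCUT-DLVCLab/lilt-roberta-en-base ships without a trained classification head, that head is randomly initialized and the label is only meaningful as a smoke test. Separately, get_page_count() calls file_name.read(), which leaves the UploadedFile buffer at EOF for any later reader; getvalue() sidesteps that. Below is a minimal sketch of wiring the selected page into the model instead, assuming PyMuPDF's get_text("words") extraction and the 0-1000 box grid that LayoutLM-style tokenizers expect; the helper name predict_page is hypothetical and not part of the commit.

# Hypothetical replacement for predict(); a sketch, not part of the commit.
@st.cache_data
def predict_page(pdf_bytes, page_num):
    tokenizer = AutoTokenizer.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")
    model = AutoModelForSequenceClassification.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")

    # An explicit filetype spares PyMuPDF from guessing the stream format.
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    page = doc[page_num - 1]

    words, boxes = [], []
    # get_text("words") yields (x0, y0, x1, y1, word, ...) tuples.
    for x0, y0, x1, y1, word, *rest in page.get_text("words"):
        words.append(word)
        # LayoutLM-style models expect boxes normalized to a 0-1000 grid.
        boxes.append([int(1000 * x0 / page.rect.width),
                      int(1000 * y0 / page.rect.height),
                      int(1000 * x1 / page.rect.width),
                      int(1000 * y1 / page.rect.height)])

    encoding = tokenizer(words, boxes=boxes, truncation=True, return_tensors="pt")
    outputs = model(**encoding)
    predicted_class_idx = outputs.logits.argmax(-1).item()
    return model.config.id2label[predicted_class_idx]

# Usage inside the button handler, e.g.:
#     prediction = predict_page(doc_file.getvalue(), page_num)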
requirements.txt CHANGED

@@ -1,4 +1,7 @@
 streamlit==1.29.0
+PyMuPDF==1.23.8
 pandas>=1.3.0
 scikit-learn
 altair>=4.0
+transformers[torch]
+datasets
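The three added requirements line up with the new imports: PyMuPDF provides the fitz module, transformers[torch] pulls in PyTorch as the backend for the LiLT checkpoint, and datasets supplies load_dataset for the FUNSD sample. A hypothetical import smoke test, assuming nothing beyond the pinned packages:

# Hypothetical check that the new dependencies resolve together; not part of the commit.
import fitz            # installed by PyMuPDF
import datasets
import torch           # pulled in via transformers[torch]
import transformers

print(fitz.__doc__)    # PyMuPDF embeds its version in the module docstring
print(transformers.__version__, datasets.__version__, torch.__version__)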