dperales committed
Commit 02d2227 · 1 Parent: a5daa47

Update app.py

Files changed (1)
  1. app.py +247 -206
app.py CHANGED
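The bulk of the change is layout work: the page switches to wide mode, the two analysis pages render into a two-column arrangement built with st.columns(2), and the long result sections move into collapsible st.expander panels. For orientation, a minimal sketch of that Streamlit pattern, independent of this app:

import streamlit as st

st.set_page_config(layout="wide")
col1, col2 = st.columns(2)       # two side-by-side containers

with col1:                       # inputs on the left
    model = st.selectbox("Choose a model", ["kmeans", "dbscan"])

with col2:                       # results on the right, collapsible
    with st.expander("Results", expanded=False):
        st.write("Selected:", model)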
@@ -14,6 +14,8 @@ from PIL import ImageDraw
from PIL import ImageFont

def main():
+    st.set_page_config(layout="wide")
+
    hide_streamlit_style = """
    <style>
    #MainMenu {visibility: hidden;}
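Streamlit requires st.set_page_config() to be the first Streamlit command executed in a script run; calling it after any other st.* call raises a StreamlitAPIException. Putting it at the top of main() is safe here only because main() is the script's entry point, as in this minimal sketch (the st.error fallback is illustrative, not the app's own handler):

import streamlit as st

def main():
    # Must run before any other st.* call in the script.
    st.set_page_config(layout="wide")
    st.title('ITACA Insurance Core AI Module')

try:
    main()
except Exception as e:
    st.error(e)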
@@ -57,72 +59,75 @@ def main():

    st.title('ITACA Insurance Core AI Module')

+    col1, col2 = st.columns(2)
+
    if page == "Clustering Analysis":
-        st.header('Clustering Analysis')
-
-        st.write(
-            """
-            """
-        )
-
-        # import pycaret unsupervised models
-        from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
-        # import ClusteringExperiment
-        from pycaret.clustering import ClusteringExperiment
-
-        # Display the list of CSV files
-        directory = "./"
-        all_files = os.listdir(directory)
-        # Filter files to only include CSV files
-        csv_files = [file for file in all_files if file.endswith(".csv")]
-        # Select a CSV file from the list
-        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
-
-        # Upload the CSV file
-        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-
-        # Define the unsupervised model
-        clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
-        selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
-
-        # Read and display the CSV file
-        if selected_csv != "None" or uploaded_file is not None:
-            if uploaded_file:
-                try:
-                    delimiter = ','
-                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
-                except ValueError:
-                    delimiter = '|'
-                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
-            else:
-                insurance_claims = pd.read_csv(selected_csv)
-
-            num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
-            insurance_claims_reduced = insurance_claims.head(num_rows)
-            st.write("Rows to be processed: " + str(num_rows))
-
-            all_columns = insurance_claims_reduced.columns.tolist()
-            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
-            insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
-
-            st.header("Inference Description")
-            insurance_claims_reduced.describe().T
-
-            cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
-            num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
-
-            # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
-            # Calculate the correlation matrix
-            corr_matrix = insurance_claims_reduced[num_col].corr()
-            # Create a Matplotlib figure
-            fig, ax = plt.subplots(figsize=(12, 8))
-            # Create a heatmap using seaborn
-            st.header("Heat Map")
-            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
-            # Set the title for the heatmap
-            ax.set_title('Correlation Heatmap')
-            # Display the heatmap in Streamlit
-            st.pyplot(fig)
+        with col1:
+            st.header('Clustering Analysis')
+
+            st.write(
+                """
+                """
+            )
+            # import pycaret unsupervised models
+            from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
+            # import ClusteringExperiment
+            from pycaret.clustering import ClusteringExperiment
+
+            # Display the list of CSV files
+            directory = "./"
+            all_files = os.listdir(directory)
+            # Filter files to only include CSV files
+            csv_files = [file for file in all_files if file.endswith(".csv")]
+            # Select a CSV file from the list
+            selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
+
+            # Upload the CSV file
+            uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+
+            # Define the unsupervised model
+            clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
+            selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
+
+            # Read and display the CSV file
+            if selected_csv != "None" or uploaded_file is not None:
+                if uploaded_file:
+                    try:
+                        delimiter = ','
+                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
+                    except ValueError:
+                        delimiter = '|'
+                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
+                else:
+                    insurance_claims = pd.read_csv(selected_csv)
+
+                num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
+                insurance_claims_reduced = insurance_claims.head(num_rows)
+                st.write("Rows to be processed: " + str(num_rows))
+
+                all_columns = insurance_claims_reduced.columns.tolist()
+                selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
+                insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
+
+                with st.expander("Inference Description", expanded=True):
+                    insurance_claims_reduced.describe().T
+
+                with st.expander("Heat Map", expanded=True):
+                    cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
+                    num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
+
+                    # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
+                    # Calculate the correlation matrix
+                    corr_matrix = insurance_claims_reduced[num_col].corr()
+                    # Create a Matplotlib figure
+                    fig, ax = plt.subplots(figsize=(12, 8))
+                    # Create a heatmap using seaborn
+                    #st.header("Heat Map")
+                    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
+                    # Set the title for the heatmap
+                    ax.set_title('Correlation Heatmap')
+                    # Display the heatmap in Streamlit
+                    st.pyplot(fig)

        if st.button("Prediction"):
            #insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
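One caveat in the loading code that this commit keeps as-is: pd.read_csv rarely raises ValueError for a wrong separator (a pipe-delimited file usually parses as a single wide column), and the uploaded buffer is not rewound before the retry, so the fallback can end up reading an empty stream. A more defensive sketch that sniffs the delimiter up front; load_claims is a hypothetical helper, not part of the commit:

import csv
import io
import pandas as pd

def load_claims(uploaded_file) -> pd.DataFrame:
    # Detect the delimiter from a sample instead of waiting for an exception.
    text = uploaded_file.read().decode('utf-8', errors='replace')
    dialect = csv.Sniffer().sniff(text[:4096], delimiters=',|;')
    return pd.read_csv(io.StringIO(text), sep=dialect.delimiter)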
@@ -136,153 +141,189 @@ def main():
            exp_clustering.setup(insurance_claims_reduced, session_id = 123)

            with st.spinner("Analyzing..."):
-                # train kmeans model
-                cluster_model = create_model(selected_model, num_clusters = selected_clusters)
-
-                cluster_model_2 = assign_model(cluster_model)
-                # Calculate summary statistics for each cluster
-                cluster_summary = cluster_model_2.groupby('Cluster').agg(['count', 'mean', 'median', 'min', 'max',
-                    'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
-                    ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
-                st.header("Cluster Summary")
-                cluster_summary
-                st.header("Assign Model")
-                cluster_model_2
-
-                # all_metrics = get_metrics()
-                # all_metrics
-
-                st.header("Clustering Metrics")
-                cluster_results = pull()
-                cluster_results
-
-                if graph_select:
-                    st.header("Clustering Plots")
-                    # plot pca cluster plot
-                    plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
-
-                    if selected_model != 'ap':
-                        plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
-
-                    if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
-                        plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
-
-                    if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
-                        plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
-
-                    if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
-                        plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
-
-                    if selected_model != 'ap':
-                        plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
-
-                # Create a Classification Model to extract feature importance
-                if feat_imp_select:
-                    st.header("Feature Importance")
-                    from pycaret.classification import setup, create_model, get_config
-                    s = setup(cluster_model_2, target = 'Cluster')
-                    lr = create_model('lr')
-
-                    # this is how you can recreate the table
-                    feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
-                    # sort by feature importance value and filter top 10
-                    feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
-                    # Display the filtered table in Streamlit
-                    # st.dataframe(feat_imp)
-                    # Display the filtered table as a bar chart in Streamlit
-                    st.bar_chart(feat_imp.set_index('Feature'))
+                with col2:
+                    st.markdown("<br><br><br><br>", unsafe_allow_html=True)
+                    # train kmeans model
+                    cluster_model = create_model(selected_model, num_clusters = selected_clusters)
+
+                    cluster_model_2 = assign_model(cluster_model)
+                    # Calculate summary statistics for each cluster
+                    cluster_summary = cluster_model_2.groupby('Cluster').agg(['count', 'mean', 'median', 'min', 'max',
+                        'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
+                        ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
+
+                    with st.expander("Cluster Summary", expanded=False):
+                        #st.header("Cluster Summary")
+                        cluster_summary
+
+                    with st.expander("Assign Model", expanded=False):
+                        #st.header("Assign Model")
+                        cluster_model_2
+
+                    # all_metrics = get_metrics()
+                    # all_metrics
+
+                    with st.expander("Clustering Metrics", expanded=False):
+                        #st.header("Clustering Metrics")
+                        cluster_results = pull()
+                        cluster_results
+
+                    with st.expander("Clustering Plots", expanded=False):
+                        if graph_select:
+                            #st.header("Clustering Plots")
+                            # plot pca cluster plot
+                            plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
+
+                            if selected_model != 'ap':
+                                plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
+
+                            if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
+                                plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
+
+                            if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
+                                plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
+
+                            if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
+                                plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
+
+                            if selected_model != 'ap':
+                                plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
+
+                    with st.expander("Feature Importance", expanded=False):
+                        # Create a Classification Model to extract feature importance
+                        if graph_select and feat_imp_select:
+                            #st.header("Feature Importance")
+                            from pycaret.classification import setup, create_model, get_config
+                            s = setup(cluster_model_2, target = 'Cluster')
+                            lr = create_model('lr')
+
+                            # this is how you can recreate the table
+                            feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
+                            # sort by feature importance value and filter top 10
+                            feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
+                            # Display the filtered table in Streamlit
+                            # st.dataframe(feat_imp)
+                            # Display the filtered table as a bar chart in Streamlit
+                            st.bar_chart(feat_imp.set_index('Feature'))

    elif page == "Anomaly Detection":
-        st.header('Anomaly Detection')
-
-        st.write(
-            """
-            """
-        )
-
-        # import pycaret anomaly
-        from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model
-        # import AnomalyExperiment
-        from pycaret.anomaly import AnomalyExperiment
-
-        # Display the list of CSV files
-        directory = "./"
-        all_files = os.listdir(directory)
-        # Filter files to only include CSV files
-        csv_files = [file for file in all_files if file.endswith(".csv")]
-        # Select a CSV file from the list
-        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
-
-        # Upload the CSV file
-        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-
-        # Define the unsupervised model
-        anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
-        selected_model = st.selectbox("Choose an anomaly model", anomalymodel)
-
-        # Read and display the CSV file
-        if selected_csv != "None" or uploaded_file is not None:
-            if uploaded_file:
-                try:
-                    delimiter = ','
-                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
-                except ValueError:
-                    delimiter = '|'
-                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
-            else:
-                insurance_claims = pd.read_csv(selected_csv)
-
-            num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
-            insurance_claims_reduced = insurance_claims.head(num_rows)
-            st.write("Rows to be processed: " + str(num_rows))
-
-            all_columns = insurance_claims_reduced.columns.tolist()
-            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
-            insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
-
-            if st.button("Prediction"):
-
-                s = setup(insurance_claims_reduced, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
-                    # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
-                    transformation=p_transformation,
-                    normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
-
-                exp_anomaly = AnomalyExperiment()
-                # init setup on exp
-                exp_anomaly.setup(insurance_claims_reduced, session_id = 123)
-
-                with st.spinner("Analyzing..."):
-                    # train model
-                    anomaly_model = create_model(selected_model)
-
-                    st.header("Assign Model")
-                    anomaly_model_2 = assign_model(anomaly_model)
-                    anomaly_model_2
-
-                    st.header("Anomaly Metrics")
-                    anomaly_results = pull()
-                    anomaly_results
-
-                    if graph_select:
-                        # plot
-                        st.header("Anomaly Plots")
-                        plot_model(anomaly_model, plot = 'tsne', display_format = 'streamlit')
-                        plot_model(anomaly_model, plot = 'umap', display_format = 'streamlit')
-
-                    if feat_imp_select:
-                        # Create a Classification Model to extract feature importance
-                        st.header("Feature Importance")
-                        from pycaret.classification import setup, create_model, get_config
-                        s = setup(anomaly_model_2, target = 'Anomaly')
-                        lr = create_model('lr')
-                        # this is how you can recreate the table
-                        feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
-                        # sort by feature importance value and filter top 10
-                        feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
-                        # Display the filtered table in Streamlit
-                        # st.dataframe(feat_imp)
-                        # Display the filtered table as a bar chart in Streamlit
-                        st.bar_chart(feat_imp.set_index('Feature'))
+        with col1:
+            st.header('Anomaly Detection')
+
+            st.write(
+                """
+                """
+            )
+
+            # import pycaret anomaly
+            from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model
+            # import AnomalyExperiment
+            from pycaret.anomaly import AnomalyExperiment
+
+            # Display the list of CSV files
+            directory = "./"
+            all_files = os.listdir(directory)
+            # Filter files to only include CSV files
+            csv_files = [file for file in all_files if file.endswith(".csv")]
+            # Select a CSV file from the list
+            selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
+
+            # Upload the CSV file
+            uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+
+            # Define the unsupervised model
+            anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
+            selected_model = st.selectbox("Choose an anomaly model", anomalymodel)
+
+            # Read and display the CSV file
+            if selected_csv != "None" or uploaded_file is not None:
+                if uploaded_file:
+                    try:
+                        delimiter = ','
+                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
+                    except ValueError:
+                        delimiter = '|'
+                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
+                else:
+                    insurance_claims = pd.read_csv(selected_csv)
+
+                num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
+                insurance_claims_reduced = insurance_claims.head(num_rows)
+                st.write("Rows to be processed: " + str(num_rows))
+
+                all_columns = insurance_claims_reduced.columns.tolist()
+                selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
+                insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
+
+                with st.expander("Inference Description", expanded=True):
+                    insurance_claims_reduced.describe().T
+
+                with st.expander("Heat Map", expanded=True):
+                    cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
+                    num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
+
+                    # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
+                    # Calculate the correlation matrix
+                    corr_matrix = insurance_claims_reduced[num_col].corr()
+                    # Create a Matplotlib figure
+                    fig, ax = plt.subplots(figsize=(12, 8))
+                    # Create a heatmap using seaborn
+                    #st.header("Heat Map")
+                    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
+                    # Set the title for the heatmap
+                    ax.set_title('Correlation Heatmap')
+                    # Display the heatmap in Streamlit
+                    st.pyplot(fig)
+
+                if st.button("Prediction"):
+
+                    s = setup(insurance_claims_reduced, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
+                        # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
+                        transformation=p_transformation,
+                        normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
+
+                    exp_anomaly = AnomalyExperiment()
+                    # init setup on exp
+                    exp_anomaly.setup(insurance_claims_reduced, session_id = 123)
+
+                    with st.spinner("Analyzing..."):
+                        with col2:
+                            st.markdown("<br><br><br><br>", unsafe_allow_html=True)
+                            # train model
+                            anomaly_model = create_model(selected_model)
+
+                            with st.expander("Assign Model", expanded=False):
+                                #st.header("Assign Model")
+                                anomaly_model_2 = assign_model(anomaly_model)
+                                anomaly_model_2
+
+                            with st.expander("Anomaly Metrics", expanded=False):
+                                #st.header("Anomaly Metrics")
+                                anomaly_results = pull()
+                                anomaly_results
+
+                            with st.expander("Anomaly Plots", expanded=False):
+                                if graph_select:
+                                    # plot
+                                    #st.header("Anomaly Plots")
+                                    plot_model(anomaly_model, plot = 'tsne', display_format = 'streamlit')
+                                    plot_model(anomaly_model, plot = 'umap', display_format = 'streamlit')
+
+                            with st.expander("Feature Importance", expanded=False):
+                                if graph_select and feat_imp_select:
+                                    # Create a Classification Model to extract feature importance
+                                    #st.header("Feature Importance")
+                                    from pycaret.classification import setup, create_model, get_config
+                                    s = setup(anomaly_model_2, target = 'Anomaly')
+                                    lr = create_model('lr')
+                                    # this is how you can recreate the table
+                                    feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
+                                    # sort by feature importance value and filter top 10
+                                    feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
+                                    # Display the filtered table in Streamlit
+                                    # st.dataframe(feat_imp)
+                                    # Display the filtered table as a bar chart in Streamlit
+                                    st.bar_chart(feat_imp.set_index('Feature'))

try:
    main()
except Exception as e:
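In both the old and new versions the Prediction handler configures PyCaret twice: the functional setup(...) call receives the preprocessing flags, while the separate ClusteringExperiment / AnomalyExperiment is initialized with only a session_id, so the flags never reach the experiment object. Under PyCaret 3.x one experiment object is enough; a runnable sketch on toy data (the random DataFrame stands in for insurance_claims_reduced, and the flag values are placeholders for the sidebar inputs):

import numpy as np
import pandas as pd
from pycaret.clustering import ClusteringExperiment

# Toy stand-in for insurance_claims_reduced.
rng = np.random.default_rng(123)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=['claim_amount', 'age', 'premium'])

exp = ClusteringExperiment()
exp.setup(df, session_id=123, normalize=True)   # one setup call, flags included
model = exp.create_model('kmeans', num_clusters=3)
clustered = exp.assign_model(model)             # adds a 'Cluster' column
metrics = exp.pull()                            # metrics table for the model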
 
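One last observation on the feature-importance blocks, which this commit now gates behind graph_select as well: abs(lr.coef_[0]) reads a single row of the coefficient matrix, and for a multiclass target such as the cluster label scikit-learn's logistic regression stores one row per class, so the chart ranks features for the first cluster only. A self-contained sketch of an average-over-classes variant; the toy blobs stand in for the app's data:

import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression

# Toy multiclass problem standing in for predicting the 'Cluster' label.
X, y = make_blobs(n_samples=200, centers=3, n_features=4, random_state=123)
X = pd.DataFrame(X, columns=['f0', 'f1', 'f2', 'f3'])
lr = LogisticRegression(max_iter=1000).fit(X, y)

# lr.coef_ has shape (n_classes, n_features); averaging absolute values
# over classes scores every feature, not just the first class's row.
importance = np.abs(lr.coef_).mean(axis=0)
feat_imp = (pd.DataFrame({'Feature': X.columns, 'Value': importance})
            .sort_values(by='Value', ascending=False))
print(feat_imp.head(10))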