dperales committed on
Commit d41e58c · 1 Parent(s): c311506

Update app.py

Files changed (1):
  1. app.py +141 -141
app.py CHANGED
@@ -59,152 +59,152 @@ def main():
 
     st.title('ITACA Insurance Core AI Module')
 
-    col1, col2 = st.columns(2)
+    #col1, col2 = st.columns(2)
 
     if page == "Clustering Analysis":
-        with col1:
-            st.header('Clustering Analysis')
-
-            st.write(
-                """
-                """
-            )
-            # import pycaret unsupervised models
-            from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
-            # import ClusteringExperiment
-            from pycaret.clustering import ClusteringExperiment
-
-            # Display the list of CSV files
-            directory = "./"
-            all_files = os.listdir(directory)
-            # Filter files to only include CSV files
-            csv_files = [file for file in all_files if file.endswith(".csv")]
-            # Select a CSV file from the list
-            selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
-
-            # Upload the CSV file
-            uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-
-            # Define the unsupervised model
-            clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
-            selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
-
-            # Read and display the CSV file
-            if selected_csv != "None" or uploaded_file is not None:
-                if uploaded_file:
-                    try:
-                        delimiter = ','
-                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
-                    except ValueError:
-                        delimiter = '|'
-                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
-                else:
-                    insurance_claims = pd.read_csv(selected_csv)
-
-                num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
-                insurance_claims_reduced = insurance_claims.head(num_rows)
-                st.write("Rows to be processed: " + str(num_rows))
-
-                all_columns = insurance_claims_reduced.columns.tolist()
-                selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
-                insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
-
-                with st.expander("Inference Description", expanded=True):
-                    insurance_claims_reduced.describe().T
-
-                with st.expander("Head Map", expanded=True):
-                    cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
-                    num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
-
-                    # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
-                    # Calculate the correlation matrix
-                    corr_matrix = insurance_claims_reduced[num_col].corr()
-                    # Create a Matplotlib figure
-                    fig, ax = plt.subplots(figsize=(12, 8))
-                    # Create a heatmap using seaborn
-                    #st.header("Heat Map")
-                    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
-                    # Set the title for the heatmap
-                    ax.set_title('Correlation Heatmap')
-                    # Display the heatmap in Streamlit
-                    st.pyplot(fig)
-
-                if st.button("Prediction"):
-                    #insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
-
-                    s = setup(insurance_claims_reduced, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
-                              # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
-                              transformation=p_transformation,
-                              normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
-                    exp_clustering = ClusteringExperiment()
-                    # init setup on exp
-                    exp_clustering.setup(insurance_claims_reduced, session_id = 123)
-
-                    with st.spinner("Analyzing..."):
-                        with col1:
-                            st.markdown("<br><br><br><br>", unsafe_allow_html=True)
-                            # train kmeans model
-                            cluster_model = create_model(selected_model, num_clusters = selected_clusters)
-
-                            cluster_model_2 = assign_model(cluster_model)
-                            # Calculate summary statistics for each cluster
-                            cluster_summary = cluster_model_2.groupby('Cluster').agg(['count', 'mean', 'median', 'min', 'max',
-                                'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
-                                ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
-
-                            with st.expander("Cluster Summary", expanded=False):
-                                #st.header("Cluster Summary")
-                                cluster_summary
-
-                            with st.expander("Model Assign", expanded=False):
-                                #st.header("Assign Model")
-                                cluster_model_2
-
-                            # all_metrics = get_metrics()
-                            # all_metrics
-
-                            with st.expander("Clustering Metrics", expanded=False):
-                                #st.header("Clustering Metrics")
-                                cluster_results = pull()
-                                cluster_results
-
-                            with st.expander("Clustering Plots", expanded=False):
-                                if graph_select:
-                                    #st.header("Clustering Plots")
-                                    # plot pca cluster plot
-                                    plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
-
-                                    if selected_model != 'ap':
-                                        plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
-
-                                    if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
-                                        plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
-
-                                    if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
-                                        plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
-
-                                    if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
-                                        plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
-
-                                    if selected_model != 'ap':
-                                        plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
-
-                            with st.expander("Feature Importance", expanded=False):
-                                # Create a Classification Model to extract feature importance
-                                if graph_select and feat_imp_select:
-                                    #st.header("Feature Importance")
-                                    from pycaret.classification import setup, create_model, get_config
-                                    s = setup(cluster_model_2, target = 'Cluster')
-                                    lr = create_model('lr')
-
-                                    # this is how you can recreate the table
-                                    feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
-                                    # sort by feature importance value and filter top 10
-                                    feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
-                                    # Display the filtered table in Streamlit
-                                    # st.dataframe(feat_imp)
-                                    # Display the filtered table as a bar chart in Streamlit
-                                    st.bar_chart(feat_imp.set_index('Feature'))
+        #with col1:
+        st.header('Clustering Analysis')
+
+        st.write(
+            """
+            """
+        )
+        # import pycaret unsupervised models
+        from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
+        # import ClusteringExperiment
+        from pycaret.clustering import ClusteringExperiment
+
+        # Display the list of CSV files
+        directory = "./"
+        all_files = os.listdir(directory)
+        # Filter files to only include CSV files
+        csv_files = [file for file in all_files if file.endswith(".csv")]
+        # Select a CSV file from the list
+        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
+
+        # Upload the CSV file
+        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+
+        # Define the unsupervised model
+        clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
+        selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
+
+        # Read and display the CSV file
+        if selected_csv != "None" or uploaded_file is not None:
+            if uploaded_file:
+                try:
+                    delimiter = ','
+                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
+                except ValueError:
+                    delimiter = '|'
+                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
+            else:
+                insurance_claims = pd.read_csv(selected_csv)
+
+            num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
+            insurance_claims_reduced = insurance_claims.head(num_rows)
+            st.write("Rows to be processed: " + str(num_rows))
+
+            all_columns = insurance_claims_reduced.columns.tolist()
+            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
+            insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
+
+            with st.expander("Inference Description", expanded=True):
+                insurance_claims_reduced.describe().T
+
+            with st.expander("Head Map", expanded=True):
+                cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
+                num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
+
+                # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
+                # Calculate the correlation matrix
+                corr_matrix = insurance_claims_reduced[num_col].corr()
+                # Create a Matplotlib figure
+                fig, ax = plt.subplots(figsize=(12, 8))
+                # Create a heatmap using seaborn
+                #st.header("Heat Map")
+                sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
+                # Set the title for the heatmap
+                ax.set_title('Correlation Heatmap')
+                # Display the heatmap in Streamlit
+                st.pyplot(fig)
+
+            if st.button("Prediction"):
+                #insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
+
+                s = setup(insurance_claims_reduced, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
+                          # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
+                          transformation=p_transformation,
+                          normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
+                exp_clustering = ClusteringExperiment()
+                # init setup on exp
+                exp_clustering.setup(insurance_claims_reduced, session_id = 123)
+
+                with st.spinner("Analyzing..."):
+                    #with col2:
+                    st.markdown("<br><br><br><br>", unsafe_allow_html=True)
+                    # train kmeans model
+                    cluster_model = create_model(selected_model, num_clusters = selected_clusters)
+
+                    cluster_model_2 = assign_model(cluster_model)
+                    # Calculate summary statistics for each cluster
+                    cluster_summary = cluster_model_2.groupby('Cluster').agg(['count', 'mean', 'median', 'min', 'max',
+                        'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
+                        ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
+
+                    with st.expander("Cluster Summary", expanded=False):
+                        #st.header("Cluster Summary")
+                        cluster_summary
+
+                    with st.expander("Model Assign", expanded=False):
+                        #st.header("Assign Model")
+                        cluster_model_2
+
+                    # all_metrics = get_metrics()
+                    # all_metrics
+
+                    with st.expander("Clustering Metrics", expanded=False):
+                        #st.header("Clustering Metrics")
+                        cluster_results = pull()
+                        cluster_results
+
+                    with st.expander("Clustering Plots", expanded=False):
+                        if graph_select:
+                            #st.header("Clustering Plots")
+                            # plot pca cluster plot
+                            plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
+
+                            if selected_model != 'ap':
+                                plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
+
+                            if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
+                                plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
+
+                            if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
+                                plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
+
+                            if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
+                                plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
+
+                            if selected_model != 'ap':
+                                plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
+
+                    with st.expander("Feature Importance", expanded=False):
+                        # Create a Classification Model to extract feature importance
+                        if graph_select and feat_imp_select:
+                            #st.header("Feature Importance")
+                            from pycaret.classification import setup, create_model, get_config
+                            s = setup(cluster_model_2, target = 'Cluster')
+                            lr = create_model('lr')
+
+                            # this is how you can recreate the table
+                            feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
+                            # sort by feature importance value and filter top 10
+                            feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
+                            # Display the filtered table in Streamlit
+                            # st.dataframe(feat_imp)
+                            # Display the filtered table as a bar chart in Streamlit
+                            st.bar_chart(feat_imp.set_index('Feature'))
 
     elif page == "Anomaly Detection":
         with col1:
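For reference, the PyCaret flow the changed block drives is setup → create_model → assign_model → pull. A minimal standalone sketch of that flow, under assumptions not in the commit (pycaret installed; 'claims.csv' is a placeholder for any mostly numeric dataset):

# Minimal sketch of the clustering flow above.
# Assumptions: pycaret is installed; 'claims.csv' is a hypothetical input.
import pandas as pd
from pycaret.clustering import setup, create_model, assign_model, pull

df = pd.read_csv('claims.csv')                  # placeholder input file
setup(df, session_id=123, normalize=True)       # preprocess into one global experiment
model = create_model('kmeans', num_clusters=4)  # any model id from the selectbox list
labeled = assign_model(model)                   # original rows plus a 'Cluster' column
print(pull())                                   # metrics grid for the last command
print(labeled.groupby('Cluster').size())        # quick per-cluster row counts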
 
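The "Feature Importance" expander takes a supervised detour to explain unsupervised clusters: it fits a classifier with the assigned 'Cluster' column as the target and ranks features by absolute coefficient. A sketch of the same idea with plain scikit-learn rather than the app's pycaret.classification route (the function name and defaults are illustrative):

# Sketch: rank features by how strongly they separate the clusters.
# Stand-in for the pycaret.classification route used in the app.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def top_features(labeled: pd.DataFrame, target: str = 'Cluster', k: int = 10) -> pd.DataFrame:
    X = labeled.drop(columns=[target]).select_dtypes('number')
    lr = LogisticRegression(max_iter=1000)
    lr.fit(StandardScaler().fit_transform(X), labeled[target])
    # abs(coef_[0]) mirrors the app's choice: coefficients of the first
    # class only, which is a simplification when there are >2 clusters.
    imp = pd.DataFrame({'Feature': X.columns, 'Value': abs(lr.coef_[0])})
    return imp.sort_values('Value', ascending=False).head(k)

# Usage, with the labeled frame from assign_model:
# st.bar_chart(top_features(cluster_model_2).set_index('Feature'))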