dperales committed
Commit 02d2227 · 1 Parent: a5daa47

Update app.py

Files changed (1)
  1. app.py +247 -206
app.py CHANGED
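The bulk of the change is layout work: the page switches to wide mode, the two analysis pages render into a two-column arrangement built with st.columns(2), and the long result sections move into collapsible st.expander panels. For orientation, a minimal sketch of that Streamlit pattern, independent of this app:

import streamlit as st

st.set_page_config(layout="wide")
col1, col2 = st.columns(2)       # two side-by-side containers

with col1:                       # inputs on the left
    model = st.selectbox("Choose a model", ["kmeans", "dbscan"])

with col2:                       # results on the right, collapsible
    with st.expander("Results", expanded=False):
        st.write("Selected:", model)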
@@ -14,6 +14,8 @@ from PIL import ImageDraw
from PIL import ImageFont

def main():
+    st.set_page_config(layout="wide")
+
    hide_streamlit_style = """
    <style>
    #MainMenu {visibility: hidden;}
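Streamlit requires st.set_page_config() to be the first Streamlit command executed in a script run; calling it after any other st.* call raises a StreamlitAPIException. Putting it at the top of main() is safe here only because main() is the script's entry point, as in this minimal sketch (the st.error fallback is illustrative, not the app's own handler):

import streamlit as st

def main():
    # Must run before any other st.* call in the script.
    st.set_page_config(layout="wide")
    st.title('ITACA Insurance Core AI Module')

try:
    main()
except Exception as e:
    st.error(e)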
@@ -57,72 +59,75 @@ def main():

    st.title('ITACA Insurance Core AI Module')

+    col1, col2 = st.columns(2)
+
    if page == "Clustering Analysis":
-        st.header('Clustering Analysis')
-
-        st.write(
-            """
-            """
-        )
-
-        # import pycaret unsupervised models
-        from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
-        # import ClusteringExperiment
-        from pycaret.clustering import ClusteringExperiment
-
-        # Display the list of CSV files
-        directory = "./"
-        all_files = os.listdir(directory)
-        # Filter files to only include CSV files
-        csv_files = [file for file in all_files if file.endswith(".csv")]
-        # Select a CSV file from the list
-        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
-
-        # Upload the CSV file
-        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-
-        # Define the unsupervised model
-        clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
-        selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
-
-        # Read and display the CSV file
-        if selected_csv != "None" or uploaded_file is not None:
-            if uploaded_file:
-                try:
-                    delimiter = ','
-                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
-                except ValueError:
-                    delimiter = '|'
-                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
-            else:
-                insurance_claims = pd.read_csv(selected_csv)
-
-            num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
-            insurance_claims_reduced = insurance_claims.head(num_rows)
-            st.write("Rows to be processed: " + str(num_rows))
-
-            all_columns = insurance_claims_reduced.columns.tolist()
-            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
-            insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
-
-            st.header("Inference Description")
-            insurance_claims_reduced.describe().T
-
-            cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
-            num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
-
-            # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
-            # Calculate the correlation matrix
-            corr_matrix = insurance_claims_reduced[num_col].corr()
-            # Create a Matplotlib figure
-            fig, ax = plt.subplots(figsize=(12, 8))
-            # Create a heatmap using seaborn
-            st.header("Heat Map")
-            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
-            # Set the title for the heatmap
-            ax.set_title('Correlation Heatmap')
-            # Display the heatmap in Streamlit
-            st.pyplot(fig)
+        with col1:
+            st.header('Clustering Analysis')
+
+            st.write(
+                """
+                """
+            )
+            # import pycaret unsupervised models
+            from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
+            # import ClusteringExperiment
+            from pycaret.clustering import ClusteringExperiment
+
+            # Display the list of CSV files
+            directory = "./"
+            all_files = os.listdir(directory)
+            # Filter files to only include CSV files
+            csv_files = [file for file in all_files if file.endswith(".csv")]
+            # Select a CSV file from the list
+            selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
+
+            # Upload the CSV file
+            uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+
+            # Define the unsupervised model
+            clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
+            selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
+
+            # Read and display the CSV file
+            if selected_csv != "None" or uploaded_file is not None:
+                if uploaded_file:
+                    try:
+                        delimiter = ','
+                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
+                    except ValueError:
+                        delimiter = '|'
+                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
+                else:
+                    insurance_claims = pd.read_csv(selected_csv)
+
+                num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
+                insurance_claims_reduced = insurance_claims.head(num_rows)
+                st.write("Rows to be processed: " + str(num_rows))
+
+                all_columns = insurance_claims_reduced.columns.tolist()
+                selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
+                insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
+
+                with st.expander("Inference Description", expanded=True):
+                    insurance_claims_reduced.describe().T
+
+                with st.expander("Heat Map", expanded=True):
+                    cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
+                    num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
+
+                    # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
+                    # Calculate the correlation matrix
+                    corr_matrix = insurance_claims_reduced[num_col].corr()
+                    # Create a Matplotlib figure
+                    fig, ax = plt.subplots(figsize=(12, 8))
+                    # Create a heatmap using seaborn
+                    #st.header("Heat Map")
+                    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
+                    # Set the title for the heatmap
+                    ax.set_title('Correlation Heatmap')
+                    # Display the heatmap in Streamlit
+                    st.pyplot(fig)

        if st.button("Prediction"):
            #insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
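One caveat in the loading code that this commit keeps as-is: pd.read_csv rarely raises ValueError for a wrong separator (a pipe-delimited file usually parses as a single wide column), and the uploaded buffer is not rewound before the retry, so the fallback can end up reading an empty stream. A more defensive sketch that sniffs the delimiter up front; load_claims is a hypothetical helper, not part of the commit:

import csv
import io
import pandas as pd

def load_claims(uploaded_file) -> pd.DataFrame:
    # Detect the delimiter from a sample instead of waiting for an exception.
    text = uploaded_file.read().decode('utf-8', errors='replace')
    dialect = csv.Sniffer().sniff(text[:4096], delimiters=',|;')
    return pd.read_csv(io.StringIO(text), sep=dialect.delimiter)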
@@ -136,153 +141,189 @@ def main():
            exp_clustering.setup(insurance_claims_reduced, session_id = 123)

            with st.spinner("Analyzing..."):
-                # train kmeans model
-                cluster_model = create_model(selected_model, num_clusters = selected_clusters)
-
-                cluster_model_2 = assign_model(cluster_model)
-                # Calculate summary statistics for each cluster
-                cluster_summary = cluster_model_2.groupby('Cluster').agg(['count', 'mean', 'median', 'min', 'max',
-                    'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
-                    ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
-                st.header("Cluster Summary")
-                cluster_summary
-                st.header("Assign Model")
-                cluster_model_2
-
-                # all_metrics = get_metrics()
-                # all_metrics
-
-                st.header("Clustering Metrics")
-                cluster_results = pull()
-                cluster_results
-
-                if graph_select:
-                    st.header("Clustering Plots")
-                    # plot pca cluster plot
-                    plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
-
-                    if selected_model != 'ap':
-                        plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
-
-                    if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
-                        plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
-
-                    if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
-                        plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
-
-                    if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
-                        plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
-
-                    if selected_model != 'ap':
-                        plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
-
-                # Create a Classification Model to extract feature importance
-                if feat_imp_select:
-                    st.header("Feature Importance")
-                    from pycaret.classification import setup, create_model, get_config
-                    s = setup(cluster_model_2, target = 'Cluster')
-                    lr = create_model('lr')
-
-                    # this is how you can recreate the table
-                    feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
-                    # sort by feature importance value and filter top 10
-                    feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
-                    # Display the filtered table in Streamlit
-                    # st.dataframe(feat_imp)
-                    # Display the filtered table as a bar chart in Streamlit
-                    st.bar_chart(feat_imp.set_index('Feature'))
+                with col2:
+                    st.markdown("<br><br><br><br>", unsafe_allow_html=True)
+                    # train kmeans model
+                    cluster_model = create_model(selected_model, num_clusters = selected_clusters)
+
+                    cluster_model_2 = assign_model(cluster_model)
+                    # Calculate summary statistics for each cluster
+                    cluster_summary = cluster_model_2.groupby('Cluster').agg(['count', 'mean', 'median', 'min', 'max',
+                        'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
+                        ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
+
+                    with st.expander("Cluster Summary", expanded=False):
+                        #st.header("Cluster Summary")
+                        cluster_summary
+
+                    with st.expander("Assign Model", expanded=False):
+                        #st.header("Assign Model")
+                        cluster_model_2
+
+                    # all_metrics = get_metrics()
+                    # all_metrics
+
+                    with st.expander("Clustering Metrics", expanded=False):
+                        #st.header("Clustering Metrics")
+                        cluster_results = pull()
+                        cluster_results
+
+                    with st.expander("Clustering Plots", expanded=False):
+                        if graph_select:
+                            #st.header("Clustering Plots")
+                            # plot pca cluster plot
+                            plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
+
+                            if selected_model != 'ap':
+                                plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
+
+                            if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
+                                plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
+
+                            if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
+                                plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
+
+                            if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
+                                plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
+
+                            if selected_model != 'ap':
+                                plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
+
+                    with st.expander("Feature Importance", expanded=False):
+                        # Create a Classification Model to extract feature importance
+                        if graph_select and feat_imp_select:
+                            #st.header("Feature Importance")
+                            from pycaret.classification import setup, create_model, get_config
+                            s = setup(cluster_model_2, target = 'Cluster')
+                            lr = create_model('lr')
+
+                            # this is how you can recreate the table
+                            feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
+                            # sort by feature importance value and filter top 10
+                            feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
+                            # Display the filtered table in Streamlit
+                            # st.dataframe(feat_imp)
+                            # Display the filtered table as a bar chart in Streamlit
+                            st.bar_chart(feat_imp.set_index('Feature'))

    elif page == "Anomaly Detection":
-        st.header('Anomaly Detection')
-
-        st.write(
-            """
-            """
-        )
-
-        # import pycaret anomaly
-        from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model
-        # import AnomalyExperiment
-        from pycaret.anomaly import AnomalyExperiment
-
-        # Display the list of CSV files
-        directory = "./"
-        all_files = os.listdir(directory)
-        # Filter files to only include CSV files
-        csv_files = [file for file in all_files if file.endswith(".csv")]
-        # Select a CSV file from the list
-        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
-
-        # Upload the CSV file
-        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-
-        # Define the unsupervised model
-        anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
-        selected_model = st.selectbox("Choose an anomaly model", anomalymodel)
-
-        # Read and display the CSV file
-        if selected_csv != "None" or uploaded_file is not None:
-            if uploaded_file:
-                try:
-                    delimiter = ','
-                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
-                except ValueError:
-                    delimiter = '|'
-                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
-            else:
-                insurance_claims = pd.read_csv(selected_csv)
-
-            num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
-            insurance_claims_reduced = insurance_claims.head(num_rows)
-            st.write("Rows to be processed: " + str(num_rows))
-
-            all_columns = insurance_claims_reduced.columns.tolist()
-            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
-            insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
-
-            if st.button("Prediction"):
-
-                s = setup(insurance_claims_reduced, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
-                    # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
-                    transformation=p_transformation,
-                    normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
-
-                exp_anomaly = AnomalyExperiment()
-                # init setup on exp
-                exp_anomaly.setup(insurance_claims_reduced, session_id = 123)
-
-                with st.spinner("Analyzing..."):
-                    # train model
-                    anomaly_model = create_model(selected_model)
-
-                    st.header("Assign Model")
-                    anomaly_model_2 = assign_model(anomaly_model)
-                    anomaly_model_2
-
-                    st.header("Anomaly Metrics")
-                    anomaly_results = pull()
-                    anomaly_results
-
-                    if graph_select:
-                        # plot
-                        st.header("Anomaly Plots")
-                        plot_model(anomaly_model, plot = 'tsne', display_format = 'streamlit')
-                        plot_model(anomaly_model, plot = 'umap', display_format = 'streamlit')
-
-                    if feat_imp_select:
-                        # Create a Classification Model to extract feature importance
-                        st.header("Feature Importance")
-                        from pycaret.classification import setup, create_model, get_config
-                        s = setup(anomaly_model_2, target = 'Anomaly')
-                        lr = create_model('lr')
-                        # this is how you can recreate the table
-                        feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
-                        # sort by feature importance value and filter top 10
-                        feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
-                        # Display the filtered table in Streamlit
-                        # st.dataframe(feat_imp)
-                        # Display the filtered table as a bar chart in Streamlit
-                        st.bar_chart(feat_imp.set_index('Feature'))
+        with col1:
+            st.header('Anomaly Detection')
+
+            st.write(
+                """
+                """
+            )
+
+            # import pycaret anomaly
+            from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model
+            # import AnomalyExperiment
+            from pycaret.anomaly import AnomalyExperiment
+
+            # Display the list of CSV files
+            directory = "./"
+            all_files = os.listdir(directory)
+            # Filter files to only include CSV files
+            csv_files = [file for file in all_files if file.endswith(".csv")]
+            # Select a CSV file from the list
+            selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
+
+            # Upload the CSV file
+            uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+
+            # Define the unsupervised model
+            anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
+            selected_model = st.selectbox("Choose an anomaly model", anomalymodel)
+
+            # Read and display the CSV file
+            if selected_csv != "None" or uploaded_file is not None:
+                if uploaded_file:
+                    try:
+                        delimiter = ','
+                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
+                    except ValueError:
+                        delimiter = '|'
+                        insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
+                else:
+                    insurance_claims = pd.read_csv(selected_csv)
+
+                num_rows = int(insurance_claims.shape[0]*int(num_lines)/100)
+                insurance_claims_reduced = insurance_claims.head(num_rows)
+                st.write("Rows to be processed: " + str(num_rows))
+
+                all_columns = insurance_claims_reduced.columns.tolist()
+                selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
+                insurance_claims_reduced = insurance_claims_reduced[selected_columns].copy()
+
+                with st.expander("Inference Description", expanded=True):
+                    insurance_claims_reduced.describe().T
+
+                with st.expander("Heat Map", expanded=True):
+                    cat_col = insurance_claims_reduced.select_dtypes(include=['object']).columns
+                    num_col = insurance_claims_reduced.select_dtypes(exclude=['object']).columns
+
+                    # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
+                    # Calculate the correlation matrix
+                    corr_matrix = insurance_claims_reduced[num_col].corr()
+                    # Create a Matplotlib figure
+                    fig, ax = plt.subplots(figsize=(12, 8))
+                    # Create a heatmap using seaborn
+                    #st.header("Heat Map")
+                    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
+                    # Set the title for the heatmap
+                    ax.set_title('Correlation Heatmap')
+                    # Display the heatmap in Streamlit
+                    st.pyplot(fig)
+
+                if st.button("Prediction"):
+
+                    s = setup(insurance_claims_reduced, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
+                        # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
+                        transformation=p_transformation,
+                        normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
+
+                    exp_anomaly = AnomalyExperiment()
+                    # init setup on exp
+                    exp_anomaly.setup(insurance_claims_reduced, session_id = 123)
+
+                    with st.spinner("Analyzing..."):
+                        with col2:
+                            st.markdown("<br><br><br><br>", unsafe_allow_html=True)
+                            # train model
+                            anomaly_model = create_model(selected_model)
+
+                            with st.expander("Assign Model", expanded=False):
+                                #st.header("Assign Model")
+                                anomaly_model_2 = assign_model(anomaly_model)
+                                anomaly_model_2
+
+                            with st.expander("Anomaly Metrics", expanded=False):
+                                #st.header("Anomaly Metrics")
+                                anomaly_results = pull()
+                                anomaly_results
+
+                            with st.expander("Anomaly Plots", expanded=False):
+                                if graph_select:
+                                    # plot
+                                    #st.header("Anomaly Plots")
+                                    plot_model(anomaly_model, plot = 'tsne', display_format = 'streamlit')
+                                    plot_model(anomaly_model, plot = 'umap', display_format = 'streamlit')
+
+                            with st.expander("Feature Importance", expanded=False):
+                                if graph_select and feat_imp_select:
+                                    # Create a Classification Model to extract feature importance
+                                    #st.header("Feature Importance")
+                                    from pycaret.classification import setup, create_model, get_config
+                                    s = setup(anomaly_model_2, target = 'Anomaly')
+                                    lr = create_model('lr')
+                                    # this is how you can recreate the table
+                                    feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value' : abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
+                                    # sort by feature importance value and filter top 10
+                                    feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
+                                    # Display the filtered table in Streamlit
+                                    # st.dataframe(feat_imp)
+                                    # Display the filtered table as a bar chart in Streamlit
+                                    st.bar_chart(feat_imp.set_index('Feature'))

try:
    main()
except Exception as e:
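In both the old and new versions the Prediction handler configures PyCaret twice: the functional setup(...) call receives the preprocessing flags, while the separate ClusteringExperiment / AnomalyExperiment is initialized with only a session_id, so the flags never reach the experiment object. Under PyCaret 3.x one experiment object is enough; a runnable sketch on toy data (the random DataFrame stands in for insurance_claims_reduced, and the flag values are placeholders for the sidebar inputs):

import numpy as np
import pandas as pd
from pycaret.clustering import ClusteringExperiment

# Toy stand-in for insurance_claims_reduced.
rng = np.random.default_rng(123)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=['claim_amount', 'age', 'premium'])

exp = ClusteringExperiment()
exp.setup(df, session_id=123, normalize=True)   # one setup call, flags included
model = exp.create_model('kmeans', num_clusters=3)
clustered = exp.assign_model(model)             # adds a 'Cluster' column
metrics = exp.pull()                            # metrics table for the model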
 
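One last observation on the feature-importance blocks, which this commit now gates behind graph_select as well: abs(lr.coef_[0]) reads a single row of the coefficient matrix, and for a multiclass target such as the cluster label scikit-learn's logistic regression stores one row per class, so the chart ranks features for the first cluster only. A self-contained sketch of an average-over-classes variant; the toy blobs stand in for the app's data:

import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression

# Toy multiclass problem standing in for predicting the 'Cluster' label.
X, y = make_blobs(n_samples=200, centers=3, n_features=4, random_state=123)
X = pd.DataFrame(X, columns=['f0', 'f1', 'f2', 'f3'])
lr = LogisticRegression(max_iter=1000).fit(X, y)

# lr.coef_ has shape (n_classes, n_features); averaging absolute values
# over classes scores every feature, not just the first class's row.
importance = np.abs(lr.coef_).mean(axis=0)
feat_imp = (pd.DataFrame({'Feature': X.columns, 'Value': importance})
            .sort_values(by='Value', ascending=False))
print(feat_imp.head(10))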