Update app.py
app.py
CHANGED
@@ -13,259 +13,267 @@ from PIL import ImageColor
 from PIL import ImageDraw
 from PIL import ImageFont
 
- [content of removed lines 16-20 not captured in this view]
             """
- [content of removed lines 22-80 not captured in this view]
-    clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
-    selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
-
-    # Read and display the CSV file
-    if selected_csv != "None" or uploaded_file is not None:
-        if uploaded_file:
-            try:
-                delimiter = ','
-                insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
-            except ValueError:
-                delimiter = '|'
-                insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
-        else:
-            insurance_claims = pd.read_csv(selected_csv)
-
-        st.header("Inference Description")
-        insurance_claims.describe().T
-
-        cat_col = insurance_claims.select_dtypes(include=['object']).columns
-        num_col = insurance_claims.select_dtypes(exclude=['object']).columns
-
-        # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
-        # Calculate the correlation matrix
-        corr_matrix = insurance_claims[num_col].corr()
-        # Create a Matplotlib figure
-        fig, ax = plt.subplots(figsize=(12, 8))
-        # Create a heatmap using seaborn
-        st.header("Heat Map")
-        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
-        # Set the title for the heatmap
-        ax.set_title('Correlation Heatmap')
-        # Display the heatmap in Streamlit
-        st.pyplot(fig)
-
-        all_columns = insurance_claims.columns.tolist()
-        selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
-
-        if st.button("Prediction"):
-            insurance_claims = insurance_claims[selected_columns].copy()
-
-            s = setup(insurance_claims, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
-                      # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
-                      transformation=p_transformation,
-                      normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
-            exp_clustering = ClusteringExperiment()
-            # init setup on exp
-            exp_clustering.setup(insurance_claims, session_id = 123)
-
-            with st.spinner("Analyzing..."):
-                # train kmeans model
-                cluster_model = create_model(selected_model, num_clusters = selected_clusters)
-
-                cluster_model_2 = assign_model(cluster_model)
-                # Calculate summary statistics for each cluster
-                cluster_summary = cluster_model_2.groupby('Cluster').agg(['count', 'mean', 'median', 'min', 'max',
-                                                                          'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
-                                                                          ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
-                st.header("Cluster Summary")
-                cluster_summary
-                st.header("Assign Model")
-                cluster_model_2
-
-                # all_metrics = get_metrics()
-                # all_metrics
-
-                st.header("Clustering Metrics")
-                cluster_results = pull()
-                cluster_results
-
-                st.header("Clustering Plots")
-                # plot pca cluster plot
-                # plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
-
-                # if selected_model != 'ap':
-                # plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
-
-                # if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
-                # plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
-
-                # if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
-                # plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
-
-                # if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
-                # plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
 
-
-                # plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
-
-                # Create a Classification Model to extract feature importance
-                st.header("Feature Importance")
-                from pycaret.classification import *
-                s = setup(cluster_model_2, target = 'Cluster')
-                lr = create_model('lr')
-                # this is how you can recreate the table
-                print("Number of columns in X_train:", len(get_config('X_train').columns))
-                print("Number of coefficients in lr:", len(lr.coef_[0]))
-                feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value': abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
-                # sort by feature importance value and filter top 10
-                feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
-                # Display the filtered table in Streamlit
-                # st.dataframe(feat_imp)
-                # Display the filtered table as a bar chart in Streamlit
-                st.bar_chart(feat_imp.set_index('Feature'))
-
-elif page == "Anomaly Detection":
-    st.header('Anomaly Detection')
-
-    st.write(
-        """
-        """
-    )
-
-    # import pycaret anomaly
-    from pycaret.anomaly import *
-    # import AnomalyExperiment
-    from pycaret.anomaly import AnomalyExperiment
-
-    # Display the list of CSV files
-    directory = "./"
-    all_files = os.listdir(directory)
-    # Filter files to only include CSV files
-    csv_files = [file for file in all_files if file.endswith(".csv")]
-    # Select a CSV file from the list
-    selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
-
-    # Upload the CSV file
-    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-
-    # Define the unsupervised model
-    anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
-    selected_model = st.selectbox("Choose an anomaly model", anomalymodel)
-
-    # Read and display the CSV file
-    if selected_csv != "None" or uploaded_file is not None:
-        if uploaded_file:
-            try:
-                delimiter = ','
-                insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
-            except ValueError:
-                delimiter = '|'
-                insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
-        else:
-            insurance_claims = pd.read_csv(selected_csv)
-
-        all_columns = insurance_claims.columns.tolist()
-        selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
-
-        if st.button("Prediction"):
-            insurance_claims = insurance_claims[selected_columns].copy()
-
-            # s = setup(insurance_claims, session_id = 123)
-
-            s = setup(insurance_claims, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
           # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
           transformation=p_transformation,
           normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
-
-
-
-
 
- [content of removed lines 242-271 not captured in this view]
+def main():
+    hide_streamlit_style = """
+        <style>
+        #MainMenu {visibility: hidden;}
+        footer {visibility: hidden;}
+        </style>
+        """
+    st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+    with st.sidebar:
+        image = Image.open('itaca_logo.png')
+        st.image(image, width=150)  # ,use_column_width=True
+        page = option_menu(menu_title='Menu',
+                           menu_icon="robot",
+                           options=["Clustering Analysis",
+                                    "Anomaly Detection"],
+                           icons=["chat-dots",
+                                  "key"],
+                           default_index=0
+                           )
+
+        # Additional section below the option menu
+        # st.markdown("---")  # Add a separator line
+        st.header("Settings")
+
+        graph_select = st.checkbox("Show Graphics", value=True)
+        feat_imp_select = st.checkbox("Feature Importance", value=False)
+
+        # Define the options for the dropdown list
+        numclusters = [2, 3, 4, 5, 6]
+        selected_clusters = st.slider("Choose a number of clusters", min_value=2, max_value=10, value=4)
+
+        p_remove_multicollinearity = st.checkbox("Remove Multicollinearity", value=False)
+        p_multicollinearity_threshold = st.slider("Choose multicollinearity thresholds", min_value=0.0, max_value=1.0, value=0.9)
+        # p_remove_outliers = st.checkbox("Remove Outliers", value=False)
+        # p_outliers_method = st.selectbox("Choose an Outlier Method", ["iforest", "ee", "lof"])
+        p_transformation = st.checkbox("Choose Power Transform", value=False)
+        p_normalize = st.checkbox("Choose Normalize", value=False)
+        p_pca = st.checkbox("Choose PCA", value=False)
+        p_pca_method = st.selectbox("Choose a PCA Method", ["linear", "kernel", "incremental"])
+
+    st.title('ITACA Insurance Core AI Module')
+
+    if page == "Clustering Analysis":
+        st.header('Clustering Analysis')
+
+        st.write(
             """
+            """
+        )
+
+        # import pycaret unsupervised models
+        from pycaret.clustering import setup, create_model, assign_model, pull, plot_model
+        # import ClusteringExperiment
+        from pycaret.clustering import ClusteringExperiment
+
+        # Display the list of CSV files
+        directory = "./"
+        all_files = os.listdir(directory)
+        # Filter files to only include CSV files
+        csv_files = [file for file in all_files if file.endswith(".csv")]
+        # Select a CSV file from the list
+        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
+
+        # Upload the CSV file
+        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+
+        # Define the unsupervised model
+        clusteringmodel = ['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics', 'birch']
+        selected_model = st.selectbox("Choose a clustering model", clusteringmodel)
+
+        # Read and display the CSV file
+        if selected_csv != "None" or uploaded_file is not None:
+            if uploaded_file:
+                try:
+                    delimiter = ','
+                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
+                except ValueError:
+                    delimiter = '|'
+                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
+            else:
+                insurance_claims = pd.read_csv(selected_csv)
+
+            st.header("Inference Description")
+            insurance_claims.describe().T
+
+            cat_col = insurance_claims.select_dtypes(include=['object']).columns
+            num_col = insurance_claims.select_dtypes(exclude=['object']).columns
+
+            # insurance_claims[num_col].hist(bins=15, figsize=(20, 15), layout=(5, 4))
+            # Calculate the correlation matrix
+            corr_matrix = insurance_claims[num_col].corr()
+            # Create a Matplotlib figure
+            fig, ax = plt.subplots(figsize=(12, 8))
+            # Create a heatmap using seaborn
+            st.header("Heat Map")
+            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
+            # Set the title for the heatmap
+            ax.set_title('Correlation Heatmap')
+            # Display the heatmap in Streamlit
+            st.pyplot(fig)
+
+            all_columns = insurance_claims.columns.tolist()
+            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
+
+            if st.button("Prediction"):
+                insurance_claims = insurance_claims[selected_columns].copy()
 
+                s = setup(insurance_claims, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
           # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
           transformation=p_transformation,
           normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
+                exp_clustering = ClusteringExperiment()
+                # init setup on exp
+                exp_clustering.setup(insurance_claims, session_id = 123)
+
+                with st.spinner("Analyzing..."):
+                    # train the selected clustering model
+                    cluster_model = create_model(selected_model, num_clusters = selected_clusters)
+
+                    cluster_model_2 = assign_model(cluster_model)
+                    # Calculate summary statistics for each cluster
+                    cluster_summary = cluster_model_2.groupby('Cluster').agg(['count', 'mean', 'median', 'min', 'max',
+                                                                              'std', 'var', 'sum', ('quantile_25', lambda x: x.quantile(0.25)),
+                                                                              ('quantile_75', lambda x: x.quantile(0.75)), 'skew'])
+                    st.header("Cluster Summary")
+                    cluster_summary
+                    st.header("Assign Model")
+                    cluster_model_2
+
+                    # all_metrics = get_metrics()
+                    # all_metrics
+
+                    st.header("Clustering Metrics")
+                    cluster_results = pull()
+                    cluster_results
+
+                    if graph_select:
+                        st.header("Clustering Plots")
+                        # plot pca cluster plot
+                        plot_model(cluster_model, plot = 'cluster', display_format = 'streamlit')
+
+                        if selected_model != 'ap':
+                            plot_model(cluster_model, plot = 'tsne', display_format = 'streamlit')
+
+                        if selected_model not in ('ap', 'meanshift', 'dbscan', 'optics'):
+                            plot_model(cluster_model, plot = 'elbow', display_format = 'streamlit')
+
+                        if selected_model not in ('ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics'):
+                            plot_model(cluster_model, plot = 'silhouette', display_format = 'streamlit')
+
+                        if selected_model not in ('ap', 'sc', 'hclust', 'dbscan', 'optics', 'birch'):
+                            plot_model(cluster_model, plot = 'distance', display_format = 'streamlit')
+
+                        if selected_model != 'ap':
+                            plot_model(cluster_model, plot = 'distribution', display_format = 'streamlit')
+
+                    # Create a Classification Model to extract feature importance
+                    if feat_imp_select:
+                        st.header("Feature Importance")
+                        from pycaret.classification import setup, create_model, get_config
+                        s = setup(cluster_model_2, target = 'Cluster')
+                        lr = create_model('lr')
+
+                        # this is how you can recreate the table
+                        feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value': abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
+                        # sort by feature importance value and filter top 10
+                        feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
+                        # Display the filtered table in Streamlit
+                        # st.dataframe(feat_imp)
+                        # Display the filtered table as a bar chart in Streamlit
+                        st.bar_chart(feat_imp.set_index('Feature'))
+
+    elif page == "Anomaly Detection":
+        st.header('Anomaly Detection')
+
+        st.write(
+            """
+            """
+        )
+
+        # import pycaret anomaly
+        from pycaret.anomaly import setup, create_model, assign_model, pull, plot_model
+        # import AnomalyExperiment
+        from pycaret.anomaly import AnomalyExperiment
+
+        # Display the list of CSV files
+        directory = "./"
+        all_files = os.listdir(directory)
+        # Filter files to only include CSV files
+        csv_files = [file for file in all_files if file.endswith(".csv")]
+        # Select a CSV file from the list
+        selected_csv = st.selectbox("Select a CSV file from the list", ["None"] + csv_files)
 
+        # Upload the CSV file
+        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+
+        # Define the unsupervised model
+        anomalymodel = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
+        selected_model = st.selectbox("Choose an anomaly model", anomalymodel)
+
+        # Read and display the CSV file
+        if selected_csv != "None" or uploaded_file is not None:
+            if uploaded_file:
+                try:
+                    delimiter = ','
+                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter)
+                except ValueError:
+                    delimiter = '|'
+                    insurance_claims = pd.read_csv(uploaded_file, sep=delimiter, encoding='latin-1')
+            else:
+                insurance_claims = pd.read_csv(selected_csv)
+
+            all_columns = insurance_claims.columns.tolist()
+            selected_columns = st.multiselect("Choose columns", all_columns, default=all_columns)
+
+            if st.button("Prediction"):
+                insurance_claims = insurance_claims[selected_columns].copy()
+
+                s = setup(insurance_claims, session_id = 123, remove_multicollinearity=p_remove_multicollinearity, multicollinearity_threshold=p_multicollinearity_threshold,
+                          # remove_outliers=p_remove_outliers, outliers_method=p_outliers_method,
+                          transformation=p_transformation,
+                          normalize=p_normalize, pca=p_pca, pca_method=p_pca_method)
+
+                exp_anomaly = AnomalyExperiment()
+                # init setup on exp
+                exp_anomaly.setup(insurance_claims, session_id = 123)
+
+                with st.spinner("Analyzing..."):
+                    # train model
+                    anomaly_model = create_model(selected_model)
+
+                    st.header("Assign Model")
+                    anomaly_model_2 = assign_model(anomaly_model)
+                    anomaly_model_2
+
+                    st.header("Anomaly Metrics")
+                    anomaly_results = pull()
+                    anomaly_results
+
+                    if graph_select:
+                        # plot
+                        st.header("Anomaly Plots")
+                        plot_model(anomaly_model, plot = 'tsne', display_format = 'streamlit')
+                        plot_model(anomaly_model, plot = 'umap', display_format = 'streamlit')
+
+                    if feat_imp_select:
+                        # Create a Classification Model to extract feature importance
+                        st.header("Feature Importance")
+                        from pycaret.classification import setup, create_model, get_config
+                        s = setup(anomaly_model_2, target = 'Anomaly')
+                        lr = create_model('lr')
+                        # this is how you can recreate the table
+                        feat_imp = pd.DataFrame({'Feature': get_config('X_train').columns, 'Value': abs(lr.coef_[0])}).sort_values(by='Value', ascending=False)
+                        # sort by feature importance value and filter top 10
+                        feat_imp = feat_imp.sort_values(by='Value', ascending=False).head(10)
+                        # Display the filtered table in Streamlit
+                        # st.dataframe(feat_imp)
+                        # Display the filtered table as a bar chart in Streamlit
+                        st.bar_chart(feat_imp.set_index('Feature'))
+try:
+    main()
+except Exception as e:
+    st.error(f"An error occurred: {e}")
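
The structural change in this commit is the tail of the new version: the whole script now runs inside main() under a try/except, so an uncaught exception is reported with st.error on the page instead of leaving the Space stuck on "Runtime error" with nothing in the UI. A minimal standalone sketch of that pattern; the divisor input is illustrative and not part of the commit:

import streamlit as st

def main():
    st.title("Demo")
    # Anything that raises while the script reruns lands in the except below.
    divisor = int(st.text_input("Divisor", "1") or "1")
    st.write("Result:", 1 / divisor)

try:
    main()
except Exception as e:
    # Report the failure on the page instead of crashing the script run.
    st.error(f"An error occurred: {e}")

One trade-off of this design: st.error shows only the message, while st.exception(e) would render the full traceback, which is usually more helpful while debugging a Space.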
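The "Feature Importance" branches rely on a common trick: fit a supervised classifier on the unsupervised labels ('Cluster' or 'Anomaly') and read the absolute coefficients as a rough feature ranking. A sketch of the same idea using scikit-learn directly, as an assumed stand-in for the PyCaret setup/create_model('lr')/get_config('X_train') calls in the diff; the data here is synthetic:

import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

# Synthetic stand-in for the insurance_claims features.
X, _ = make_blobs(n_samples=300, n_features=4, centers=3, random_state=123)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(4)])

# Unsupervised labels, playing the role of assign_model()'s Cluster column.
labels = KMeans(n_clusters=3, n_init=10, random_state=123).fit_predict(X)

# Supervised model on those labels; its weights rank the features.
clf = LogisticRegression(max_iter=1000).fit(X, labels)

# abs(coef_[0]) mirrors the diff, but it reads only the first class's
# weights, so treat the ranking as a heuristic rather than a true importance.
feat_imp = (pd.DataFrame({"Feature": X.columns, "Value": abs(clf.coef_[0])})
            .sort_values(by="Value", ascending=False)
            .head(10))
print(feat_imp)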