File size: 5,395 Bytes
1487912
 
 
 
 
 
 
 
 
40bd51a
1487912
40bd51a
 
 
 
 
 
 
 
1487912
 
a2852d9
 
 
 
 
 
40bd51a
 
a2852d9
 
40bd51a
 
a2852d9
40bd51a
a2852d9
40bd51a
1487912
 
 
 
 
 
40bd51a
1487912
 
 
 
 
40bd51a
1487912
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40bd51a
 
 
 
1487912
40bd51a
 
 
 
7f38fb4
40bd51a
 
 
7f38fb4
40bd51a
 
 
 
 
7f38fb4
 
40bd51a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1487912
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
import joblib
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Function to calculate VIF and filter features with VIF < 10
def calculate_vif(df):
    """Return the names of the columns of ``df`` whose VIF is below 10.

    The variance inflation factor of each column is computed with
    statsmodels' ``variance_inflation_factor`` on ``df.values`` exactly as
    given (no intercept column is added here). A column whose factor is NaN
    or infinite is left out because it fails the ``< 10`` comparison.

    Args:
        df: DataFrame whose columns are the candidate features.

    Returns:
        List of column names, in their original order, with VIF < 10.
    """
    design_matrix = df.values
    vif_scores = [
        variance_inflation_factor(design_matrix, position)
        for position in range(design_matrix.shape[1])
    ]
    return [
        column
        for column, score in zip(df.columns, vif_scores)
        if score < 10
    ]

# Function to load and process data (including VIF and PCA)
def process_data(file, scaler_option):
    """Load a CSV and project its low-collinearity numeric features to 2-D.

    Pipeline: read the CSV, keep the numeric columns, impute missing values
    with each column's mean, keep the features returned by ``calculate_vif``,
    scale them with the chosen scaler and reduce them to two principal
    components.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        scaler_option: ``'StandardScaler'`` or ``'MinMaxScaler'``.

    Returns:
        ``(pca_data, selected_features)`` where ``pca_data`` is the output of
        ``PCA(n_components=2).fit_transform`` and ``selected_features`` is
        the list of retained column names, or ``(None, None)`` (after
        reporting the reason through ``st.error``) when the data cannot be
        processed.

    Raises:
        ValueError: If ``scaler_option`` is not one of the supported names.
    """
    # Validate up front: a plain if/elif would leave `scaler` unbound for any
    # other value and only fail later with an UnboundLocalError.
    if scaler_option not in ('StandardScaler', 'MinMaxScaler'):
        raise ValueError(
            f"Unsupported scaler_option {scaler_option!r}; "
            "expected 'StandardScaler' or 'MinMaxScaler'."
        )

    df = pd.read_csv(file)

    # Select only numeric columns for VIF calculation
    df_numeric = df.select_dtypes(include=[np.number])

    # A column with no values at all has a NaN mean, so mean imputation
    # cannot fill it; drop such columns so NaNs never reach the VIF step.
    df_numeric = df_numeric.dropna(axis=1, how='all')

    # Handle missing values by filling them with the mean of the respective columns
    df_numeric = df_numeric.fillna(df_numeric.mean())

    # Calculate VIF and filter features with VIF < 10
    selected_features = calculate_vif(df_numeric)

    if not selected_features:
        st.error("No features with VIF < 10 found. Please review the data.")
        return None, None

    df_filtered = df_numeric[selected_features]

    # PCA(n_components=2) needs at least 2 rows and 2 columns; report the
    # problem the same way as the VIF failure instead of letting PCA raise.
    if min(df_filtered.shape) < 2:
        st.error(
            "At least 2 rows and 2 features with VIF < 10 are required "
            "for the 2-component PCA. Please review the data."
        )
        return None, None

    # Apply chosen scaler (the name was validated above)
    if scaler_option == 'StandardScaler':
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()

    scaled_data = scaler.fit_transform(df_filtered)

    # PCA Transformation (2 components for visualization)
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(scaled_data)

    return pca_data, selected_features

def _plot_clusters(axis, uploaded_model, params, points, title, missing_title):
    """Fit an uploaded clustering estimator and scatter its labels on ``axis``.

    Args:
        axis: Matplotlib axes to draw on.
        uploaded_model: File object returned by ``st.file_uploader`` or None.
        params: Keyword arguments forwarded to the estimator's ``set_params``.
        points: 2-column array; column 0 is plotted on x, column 1 on y.
        title: Axes title used when a model was uploaded.
        missing_title: Axes title used when ``uploaded_model`` is None.
    """
    if uploaded_model is None:
        axis.set_title(missing_title)
        return
    # SECURITY: joblib.load unpickles the upload, and unpickling can execute
    # arbitrary code. Only use model files that come from a trusted source.
    estimator = joblib.load(uploaded_model)
    # The estimator is re-configured and re-fitted on the current data, so the
    # sidebar parameters override whatever the saved model was trained with.
    estimator.set_params(**params)
    labels = estimator.fit_predict(points)
    axis.scatter(points[:, 0], points[:, 1], c=labels, cmap='viridis')
    axis.set_title(title)


# Set up the Streamlit page
st.title("Clustering Analysis with K-means, Hierarchical, and DBSCAN Models")

# Upload the detectors report CSV file
data_file = st.file_uploader("Upload the detectors report file (.csv)", type="csv")

# Upload the models
kmeans_model = st.file_uploader("Upload the K-means model (.sav)", type="sav")
hierarchical_model = st.file_uploader("Upload the Hierarchical Clustering model (.sav)", type="sav")
dbscan_model = st.file_uploader("Upload the DBSCAN model (.sav)", type="sav")

# Parameter selection for K-means, Hierarchical Clustering, and DBSCAN
if data_file is not None:
    st.sidebar.header("Adjust Clustering Parameters")

    # Scaler selection
    scaler_option = st.sidebar.selectbox("Choose Scaler", ("StandardScaler", "MinMaxScaler"))

    # K-means parameters
    kmeans_clusters = st.sidebar.slider("K-means: Number of Clusters", min_value=2, max_value=10, value=3)

    # Hierarchical Clustering parameters
    hierarchical_clusters = st.sidebar.slider("Hierarchical: Number of Clusters", min_value=2, max_value=10, value=3)
    linkage = st.sidebar.selectbox("Hierarchical: Linkage Method", ["ward", "complete", "average", "single"])

    # DBSCAN parameters
    dbscan_eps = st.sidebar.number_input("DBSCAN: Epsilon", min_value=0.1, max_value=10.0, value=0.5, step=0.1)
    dbscan_min_samples = st.sidebar.slider("DBSCAN: Minimum Samples", min_value=1, max_value=20, value=5)

    # Load and process the data
    pca_data, selected_features = process_data(data_file, scaler_option)

    if pca_data is not None:
        st.write(f"Selected features after VIF filtering: {selected_features}")

        # One panel per algorithm; plt.subplots(1, 3) already returns a 1-D
        # array of axes, so no flattening is needed.
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        # K-means Clustering (n_init='auto' is only accepted by newer
        # versions of sklearn)
        _plot_clusters(
            axes[0],
            kmeans_model,
            {"n_clusters": kmeans_clusters, "n_init": 'auto'},
            pca_data,
            f"K-means Clustering (n_clusters={kmeans_clusters})",
            "K-means Model Missing",
        )

        # Hierarchical Clustering
        _plot_clusters(
            axes[1],
            hierarchical_model,
            {"n_clusters": hierarchical_clusters, "linkage": linkage},
            pca_data,
            f"Hierarchical Clustering (n_clusters={hierarchical_clusters}, linkage={linkage})",
            "Hierarchical Model Missing",
        )

        # DBSCAN Clustering
        _plot_clusters(
            axes[2],
            dbscan_model,
            {"eps": dbscan_eps, "min_samples": dbscan_min_samples},
            pca_data,
            f"DBSCAN Clustering (eps={dbscan_eps}, min_samples={dbscan_min_samples})",
            "DBSCAN Model Missing",
        )

        # Display the plots
        st.pyplot(fig)
        # Streamlit re-runs this script on every widget change; close the
        # figure so pyplot does not keep one open figure per rerun.
        plt.close(fig)
    else:
        st.warning("Data processing failed due to VIF constraints.")
else:
    st.info("Please upload the detectors report file to proceed.")