File size: 6,543 Bytes
3ca4800
 
 
 
 
 
 
3972ce2
 
3ca4800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3972ce2
 
 
 
3ca4800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3972ce2
3ca4800
 
3972ce2
b8f81ca
3972ce2
 
b8f81ca
 
 
 
 
 
 
 
3972ce2
3ca4800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3972ce2
 
 
 
 
3ca4800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3972ce2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
import plotly.express as px
import plotly.graph_objects as go

# Streamlit app: cluster the numerical columns of an uploaded CSV, compare
# silhouette scores for K-Means and hierarchical clustering, and (when a
# datetime column is present) fit an ARIMA model to a numerical column.

# Range of cluster counts offered by the sliders and the comparison table.
_MIN_CLUSTERS = 2
_MAX_CLUSTERS = 7


def _silhouette_defined(labels, n_samples):
    """Return True when ``silhouette_score`` can be computed for ``labels``.

    scikit-learn only defines the score for
    ``2 <= n_labels <= n_samples - 1`` and raises ``ValueError`` otherwise.
    """
    return 2 <= len(set(labels)) <= n_samples - 1


def _find_datetime_column(frame):
    """Locate the first datetime-like column of ``frame``.

    ``pd.read_csv`` leaves dates as text unless asked to parse them, so text
    columns are tentatively parsed; a text column qualifies only when every
    non-missing value parses to a timestamp.

    Returns:
        ``(column_name, parsed_series)`` or ``(None, None)`` if nothing
        qualifies.
    """
    for name in frame.columns:
        column = frame[name]
        if pd.api.types.is_datetime64_any_dtype(column):
            return name, column

        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if not is_text or not column.notna().any():
            continue

        try:
            parsed = pd.to_datetime(column, errors="coerce")
        except (ValueError, TypeError, OverflowError):
            continue

        if (pd.api.types.is_datetime64_any_dtype(parsed)
                and parsed.notna().sum() == column.notna().sum()):
            return name, parsed

    return None, None


def _pick_cluster_count(label, upper):
    """Show a cluster-count slider capped at ``upper`` and return the choice.

    A slider needs ``min_value < max_value``, so when only the minimum count
    is feasible that count is returned without rendering a widget.
    """
    if upper <= _MIN_CLUSTERS:
        return _MIN_CLUSTERS
    return st.slider(label, min_value=_MIN_CLUSTERS, max_value=upper,
                     value=min(3, upper))


def _run_clustering(features):
    """Render the clustering controls, fit the chosen model, return labels."""
    n_samples = features.shape[0]
    method = st.selectbox("Choose a clustering method",
                          ["K-Means", "Hierarchical Clustering", "DBSCAN"])

    if method == "DBSCAN":
        eps_value = st.slider("Select eps value for DBSCAN",
                              min_value=0.1, max_value=2.0, value=0.5)
        min_samples_value = st.slider("Select minimum samples for DBSCAN",
                                      min_value=1, max_value=10, value=5)
        labels = DBSCAN(eps=eps_value,
                        min_samples=min_samples_value).fit_predict(features)

        # NOTE: DBSCAN marks noise as -1; as in the original script, noise is
        # scored as if it were a cluster of its own.
        if _silhouette_defined(labels, n_samples):
            silhouette_avg = silhouette_score(features, labels)
            st.write(f"DBSCAN Silhouette Score: {silhouette_avg}")
        else:
            st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")
        return labels

    # Never offer more clusters than the silhouette score can handle.
    upper = min(_MAX_CLUSTERS, n_samples - 1)
    n_clusters = _pick_cluster_count(
        f"Select number of clusters for {method}", upper)

    if method == "K-Means":
        model = KMeans(n_clusters=n_clusters, random_state=42)
    else:
        model = AgglomerativeClustering(n_clusters=n_clusters)
    labels = model.fit_predict(features)

    # Duplicate rows can collapse clusters, so re-check before scoring.
    if _silhouette_defined(labels, n_samples):
        silhouette_avg = silhouette_score(features, labels)
        st.write(f"{method} Silhouette Score for {n_clusters} clusters: {silhouette_avg}")
    else:
        st.write(f"{method} produced too few distinct clusters to compute a silhouette score.")
    return labels


def _plot_clusters(features, labels, column_names):
    """Scatter the first two standardized features coloured by cluster."""
    if len(set(labels)) <= 1:
        return

    st.write("Cluster Labels:", np.unique(labels))

    if features.shape[1] < 2:
        st.info("At least two numerical columns are needed for the cluster scatter plot.")
        return

    # Labels are cast to str so Plotly uses discrete colours, not a colour bar.
    fig = px.scatter(x=features[:, 0], y=features[:, 1],
                     color=np.asarray(labels).astype(str),
                     title="Clustering Results",
                     labels={'x': column_names[0], 'y': column_names[1],
                             'color': 'Cluster'})

    # The axes are left to auto-range: standardized features routinely fall
    # outside [-1, 1], so a fixed window would hide part of the data.
    fig.update_layout(xaxis_title=column_names[0],
                      yaxis_title=column_names[1])
    st.plotly_chart(fig)


def _run_arima(data, numerical_cols):
    """Fit and plot an ARIMA model when the data has a datetime column.

    The model is fitted to a user-selected numerical column indexed by the
    detected datetime column. Nothing is rendered if no datetime column or no
    numerical column is available.
    """
    time_col, timestamps = _find_datetime_column(data)
    value_candidates = [col for col in numerical_cols if col != time_col]
    if time_col is None or not value_candidates:
        return

    st.write("Time Series Analysis (ARIMA) on column:", time_col)
    value_col = st.selectbox("Column to model with ARIMA", value_candidates)

    series = pd.Series(data[value_col].to_numpy(dtype=float),
                       index=pd.DatetimeIndex(timestamps), name=value_col)
    series = series[series.index.notna()].dropna().sort_index()

    # ARIMA model order
    p = int(st.number_input("ARIMA p value", min_value=0, max_value=5, value=1))
    d = int(st.number_input("ARIMA d value", min_value=0, max_value=2, value=1))
    q = int(st.number_input("ARIMA q value", min_value=0, max_value=5, value=1))

    # Heuristic guard only; the fit below still reports its own failures.
    if len(series) <= p + d + q + 1:
        st.warning("Not enough observations to fit the selected ARIMA order.")
        return

    try:
        arima_result = ARIMA(series, order=(p, d, q)).fit()
    except (ValueError, np.linalg.LinAlgError) as exc:
        st.error(f"ARIMA({p}, {d}, {q}) could not be fitted: {exc}")
        return

    # The summary is a fixed-width text table, so render it verbatim.
    st.text(arima_result.summary().as_text())

    # The first d in-sample predictions precede the differencing warm-up and
    # are not meaningful, so they are left out of the chart.
    fitted = arima_result.fittedvalues.iloc[d:]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=series.index, y=series.to_numpy(),
                             mode='lines', name='Observed'))
    fig.add_trace(go.Scatter(x=fitted.index, y=fitted.to_numpy(),
                             mode='lines', name='In-sample prediction'))
    fig.update_layout(title=f"ARIMA({p}, {d}, {q}) fit for {value_col}",
                      xaxis_title=time_col, yaxis_title=value_col)
    st.plotly_chart(fig)


def _show_silhouette_comparison(features):
    """Tabulate and plot silhouette scores for K-Means vs. hierarchical."""
    n_samples = features.shape[0]
    st.write("### Silhouette Score Table for 2-7 Clusters")

    columns = ['Number of Clusters', 'K-Means Silhouette Score',
               'Hierarchical Silhouette Score']
    rows = []
    # Cap the range so small files do not request more clusters than the
    # silhouette score supports.
    for n_clusters in range(_MIN_CLUSTERS, min(_MAX_CLUSTERS, n_samples - 1) + 1):
        kmeans_labels = KMeans(n_clusters=n_clusters,
                               random_state=42).fit_predict(features)
        hierarchical_labels = AgglomerativeClustering(
            n_clusters=n_clusters).fit_predict(features)

        rows.append({
            columns[0]: n_clusters,
            columns[1]: (silhouette_score(features, kmeans_labels)
                         if _silhouette_defined(kmeans_labels, n_samples)
                         else np.nan),
            columns[2]: (silhouette_score(features, hierarchical_labels)
                         if _silhouette_defined(hierarchical_labels, n_samples)
                         else np.nan),
        })

    silhouette_df = pd.DataFrame(rows, columns=columns)
    st.dataframe(silhouette_df)

    fig = go.Figure()
    for score_column in columns[1:]:
        fig.add_trace(go.Scatter(x=silhouette_df['Number of Clusters'],
                                 y=silhouette_df[score_column],
                                 mode='lines+markers', name=score_column))

    # Silhouette scores are bounded by [-1, 1], so a fixed axis is safe here.
    fig.update_layout(
        title="Silhouette Scores for K-Means and Hierarchical Clustering",
        xaxis_title="Number of Clusters",
        yaxis_title="Silhouette Score",
        yaxis=dict(range=[-1, 1], dtick=0.2)
    )
    st.plotly_chart(fig)


def main():
    """Run the app: upload, standardize, cluster, ARIMA, silhouette table."""
    # Streamlit app title
    st.title('Clustering and Time Series Analysis')

    # Step 1: Upload CSV file
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        return

    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
        return
    st.write("Dataset Preview:", data.head())

    # Step 2: Data preprocessing - only numerical columns are clustered.
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    st.write("Numerical columns for clustering:", numerical_cols)
    if not numerical_cols:
        st.error("The uploaded file has no numerical columns to cluster.")
        return

    # The scaler and the clustering estimators reject NaN/inf, so incomplete
    # rows are dropped before fitting.
    numeric = data[numerical_cols].replace([np.inf, -np.inf], np.nan).dropna()
    dropped = len(data) - len(numeric)
    if dropped:
        st.warning(f"Ignoring {dropped} row(s) with missing or infinite values "
                   "in the numerical columns.")
    if len(numeric) < 3:
        st.error("At least 3 complete rows are required for clustering.")
        return

    # Step 2.1: Standardization (always applied)
    data_scaled = StandardScaler().fit_transform(numeric)
    st.write("Data has been standardized using StandardScaler.")

    # Step 3: Clustering algorithm selection
    cluster_labels = _run_clustering(data_scaled)

    # Step 4: Visualize the clusters using Plotly
    _plot_clusters(data_scaled, cluster_labels, numerical_cols)

    # Step 5: ARIMA time series analysis
    _run_arima(data, numerical_cols)

    # Step 6: Silhouette score table for K-Means and hierarchical clustering
    _show_silhouette_comparison(data_scaled)


# Called unconditionally so the script still executes top to bottom on every
# run/rerun, exactly as the original flat script did.
main()