File size: 5,317 Bytes
3ca4800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import seaborn as sns

# Cluster counts offered by the sliders and compared in the summary table.
_MIN_CLUSTERS = 2
_MAX_CLUSTERS = 7


def _safe_silhouette(features, cluster_labels):
    """Return the silhouette score, or None when it is undefined.

    silhouette_score raises ValueError unless the number of distinct labels
    lies in [2, n_samples - 1], so that range is checked first.
    """
    n_labels = len(set(cluster_labels))
    if n_labels < 2 or n_labels > len(cluster_labels) - 1:
        return None
    return silhouette_score(features, cluster_labels)


def _report_silhouette(label, features, cluster_labels, fallback):
    """Write '<label>: <score>' to the page, or `fallback` if no score exists."""
    score = _safe_silhouette(features, cluster_labels)
    if score is None:
        st.write(fallback)
    else:
        st.write(f"{label}: {score}")


def _max_cluster_count(n_samples):
    """Largest cluster count usable with n_samples rows (capped at 7)."""
    # The silhouette score needs at least one cluster with two members,
    # hence n_samples - 1.
    return min(_MAX_CLUSTERS, n_samples - 1)


def _cluster_count_slider(label, max_clusters):
    """Ask the user for a cluster count between 2 and max_clusters."""
    if max_clusters <= _MIN_CLUSTERS:
        # A slider whose min and max coincide is not a valid widget.
        return _MIN_CLUSTERS
    return st.slider(
        label,
        min_value=_MIN_CLUSTERS,
        max_value=max_clusters,
        value=min(3, max_clusters),
    )


def _prepare_features(data, numerical_cols):
    """Build the feature matrix used for clustering.

    Rows containing missing or infinite values are dropped because the
    clustering estimators reject them. Returns a 2-D float array, or None
    (after reporting the reason on the page) when clustering is impossible.
    """
    if not numerical_cols:
        st.error("The dataset has no numerical columns to cluster.")
        return None

    features = data[numerical_cols].replace([np.inf, -np.inf], np.nan).dropna()
    dropped = len(data) - len(features)
    if dropped:
        st.warning(f"Ignoring {dropped} row(s) with missing or infinite values.")
    if len(features) < _MIN_CLUSTERS + 1:
        st.error("At least 3 complete rows are needed for clustering.")
        return None

    # Option to scale data or not
    scale_data = st.checkbox("Scale Data", value=True)
    if scale_data:
        return StandardScaler().fit_transform(features)
    return features.to_numpy(dtype=float)


def _run_clustering(data_scaled, numerical_cols):
    """Let the user pick a clustering method, then score and plot the result."""
    max_clusters = _max_cluster_count(data_scaled.shape[0])
    clustering_method = st.selectbox(
        "Choose a clustering method",
        ["K-Means", "Hierarchical Clustering", "DBSCAN"],
    )

    if clustering_method == "K-Means":
        k = _cluster_count_slider("Select number of clusters for K-Means", max_clusters)
        cluster_labels = KMeans(n_clusters=k, random_state=42).fit_predict(data_scaled)
        _report_silhouette(
            f"K-Means Silhouette Score for {k} clusters",
            data_scaled,
            cluster_labels,
            "K-Means did not produce at least two distinct clusters.",
        )
    elif clustering_method == "Hierarchical Clustering":
        k = _cluster_count_slider(
            "Select number of clusters for Hierarchical Clustering", max_clusters
        )
        cluster_labels = AgglomerativeClustering(n_clusters=k).fit_predict(data_scaled)
        _report_silhouette(
            f"Hierarchical Clustering Silhouette Score for {k} clusters",
            data_scaled,
            cluster_labels,
            "Hierarchical Clustering did not produce at least two distinct clusters.",
        )
    else:  # DBSCAN
        eps_value = st.slider("Select eps value for DBSCAN", min_value=0.1, max_value=2.0, value=0.5)
        min_samples_value = st.slider("Select minimum samples for DBSCAN", min_value=1, max_value=10, value=5)
        dbscan = DBSCAN(eps=eps_value, min_samples=min_samples_value)
        cluster_labels = dbscan.fit_predict(data_scaled)
        # DBSCAN may label everything as one group, or (with min_samples=1)
        # make every point its own cluster; neither has a silhouette score.
        _report_silhouette(
            "DBSCAN Silhouette Score",
            data_scaled,
            cluster_labels,
            "DBSCAN did not form valid clusters. Try adjusting eps or min_samples.",
        )

    # Visualize the clusters if more than one label was produced.
    if len(set(cluster_labels)) > 1:
        st.write("Cluster Labels:", np.unique(cluster_labels))
        if data_scaled.shape[1] >= 2:
            # Use an explicit figure instead of the global pyplot state.
            fig, ax = plt.subplots()
            sns.scatterplot(
                x=data_scaled[:, 0],
                y=data_scaled[:, 1],
                hue=cluster_labels,
                palette='Set1',
                ax=ax,
            )
            ax.set_xlabel(str(numerical_cols[0]))
            ax.set_ylabel(str(numerical_cols[1]))
            st.pyplot(fig)
            plt.close(fig)  # avoid accumulating figures across reruns
        else:
            st.info("A scatter plot needs at least two numerical columns.")


def _find_datetime_column(data):
    """Return (name, parsed timestamps) of the first date-like column.

    pd.read_csv does not parse dates by default, so text columns are also
    tried with pd.to_datetime and accepted when every non-missing value
    parses. Returns (None, None) when no column qualifies.
    """
    for name in data.columns:
        column = data[name]
        if pd.api.types.is_datetime64_any_dtype(column):
            return name, column
        if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
            continue
        present = column.notna()
        if not present.any():
            continue
        try:
            parsed = pd.to_datetime(column, errors="coerce")
        except (ValueError, TypeError):
            continue
        if parsed[present].notna().all():
            return name, parsed
    return None, None


def _run_arima(data, numerical_cols):
    """Fit an ARIMA model to a numerical column ordered by the time column."""
    time_series_col, timestamps = _find_datetime_column(data)
    if time_series_col is None:
        return

    st.write("Time Series Analysis (ARIMA) on column:", time_series_col)
    if not numerical_cols:
        st.warning("No numerical column is available to model over time.")
        return
    value_col = st.selectbox("Choose the numerical column to model", numerical_cols)

    series = (
        pd.DataFrame({"time": timestamps, "value": data[value_col]})
        .dropna()
        .sort_values("time")
        .set_index("time")["value"]
        .astype(float)
    )

    # ARIMA model order
    p = int(st.number_input("ARIMA p value", min_value=0, max_value=5, value=1))
    d = int(st.number_input("ARIMA d value", min_value=0, max_value=2, value=1))
    q = int(st.number_input("ARIMA q value", min_value=0, max_value=5, value=1))

    # Heuristic guard: too few observations to estimate the requested order.
    if len(series) < p + d + q + 2:
        st.warning("Not enough observations to fit the requested ARIMA order.")
        return

    try:
        # Fit on the raw values; the timestamps are only used for plotting,
        # which sidesteps irregular or duplicated date indexes.
        arima_result = ARIMA(series.to_numpy(), order=(p, d, q)).fit()
    except (ValueError, np.linalg.LinAlgError) as exc:
        st.error(f"ARIMA could not be fitted: {exc}")
        return

    # Display ARIMA result summary
    st.write(arima_result.summary())

    # Plot the observations against the in-sample one-step predictions.
    # The first d predictions are skipped because differencing leaves them
    # without a meaningful value.
    fitted = arima_result.predict(start=d)
    fig, ax = plt.subplots()
    ax.plot(series.index, series.to_numpy(), label="observed")
    ax.plot(series.index[d:], fitted, label="in-sample prediction")
    ax.set_xlabel(str(time_series_col))
    ax.set_ylabel(str(value_col))
    ax.legend()
    st.pyplot(fig)
    plt.close(fig)


def _show_silhouette_table(data_scaled):
    """Tabulate K-Means and hierarchical silhouette scores per cluster count."""
    max_clusters = _max_cluster_count(data_scaled.shape[0])
    st.write(f"### Silhouette Score Table for {_MIN_CLUSTERS}-{max_clusters} Clusters")

    rows = []
    for n_clusters in range(_MIN_CLUSTERS, max_clusters + 1):
        kmeans_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(data_scaled)
        hierarchical_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(data_scaled)
        kmeans_score = _safe_silhouette(data_scaled, kmeans_labels)
        hierarchical_score = _safe_silhouette(data_scaled, hierarchical_labels)
        rows.append({
            'Number of Clusters': n_clusters,
            'K-Means Silhouette Score': np.nan if kmeans_score is None else kmeans_score,
            'Hierarchical Silhouette Score': np.nan if hierarchical_score is None else hierarchical_score,
        })

    st.write(pd.DataFrame(
        rows,
        columns=['Number of Clusters', 'K-Means Silhouette Score', 'Hierarchical Silhouette Score'],
    ))


def main():
    """Render the app: upload a CSV, cluster it, and optionally fit ARIMA."""
    # Streamlit app title
    st.title('Clustering and Time Series Analysis')

    # Step 1: Upload CSV file
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        return

    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the CSV file: {exc}")
        return
    st.write("Dataset Preview:", data.head())

    # Step 2: Data Preprocessing
    # Selecting only numerical columns for clustering
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    st.write("Numerical columns for clustering:", numerical_cols)
    data_scaled = _prepare_features(data, numerical_cols)

    # Steps 3-4: Clustering and visualization
    if data_scaled is not None:
        _run_clustering(data_scaled, numerical_cols)

    # Step 5: ARIMA Time Series Analysis (only when a date-like column exists)
    _run_arima(data, numerical_cols)

    # Step 6: Silhouette Score Table for K-Means and Hierarchical Clustering
    if data_scaled is not None:
        _show_silhouette_table(data_scaled)


# Streamlit executes the script top to bottom on every interaction.
main()