"""Streamlit app: cluster the numeric columns of an uploaded CSV.

The app lets the user pick K-Means, hierarchical clustering or DBSCAN,
reports the silhouette score, plots the first two features coloured by
cluster, optionally fits an ARIMA model when a date/time column is present,
and finally tabulates silhouette scores for 2-7 clusters.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA

MIN_CLUSTERS = 2
MAX_CLUSTERS = 7
RANDOM_STATE = 42
DBSCAN_NOISE_LABEL = -1  # label DBSCAN assigns to points outside any cluster


def _safe_silhouette(features, labels):
    """Return the silhouette score, or ``None`` when it is undefined.

    ``silhouette_score`` raises unless the number of distinct labels is
    between 2 and ``n_samples - 1``; this helper turns that case into
    ``None`` so callers can report it instead of crashing.
    """
    n_labels = np.unique(labels).size
    if n_labels < 2 or n_labels >= len(labels):
        return None
    return silhouette_score(features, labels)


def _fit_partition(method, features, n_clusters):
    """Fit K-Means or hierarchical clustering and return the labels."""
    if method == "K-Means":
        model = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
    else:
        model = AgglomerativeClustering(n_clusters=n_clusters)
    return model.fit_predict(features)


def _run_clustering(features):
    """Render the clustering controls and return labels (or ``None``)."""
    method = st.selectbox(
        "Choose a clustering method",
        ["K-Means", "Hierarchical Clustering", "DBSCAN"],
    )

    if method == "DBSCAN":
        eps_value = st.slider(
            "Select eps value for DBSCAN",
            min_value=0.1, max_value=2.0, value=0.5,
        )
        min_samples_value = st.slider(
            "Select minimum samples for DBSCAN",
            min_value=1, max_value=10, value=5,
        )
        labels = DBSCAN(
            eps=eps_value, min_samples=min_samples_value
        ).fit_predict(features)
        # Noise points are not a cluster: score only the clustered points so
        # "one cluster + noise" is not mistaken for a valid partition.
        clustered = labels != DBSCAN_NOISE_LABEL
        score = _safe_silhouette(features[clustered], labels[clustered])
        if score is not None:
            st.write(f"DBSCAN Silhouette Score: {score}")
        else:
            st.write(
                "DBSCAN did not form valid clusters. "
                "Try adjusting eps or min_samples."
            )
        return labels

    k_range = st.slider(
        f"Select number of clusters for {method}",
        min_value=MIN_CLUSTERS, max_value=MAX_CLUSTERS, value=3,
    )
    if len(features) < k_range:
        st.warning(
            f"Need at least {k_range} complete rows to form {k_range} clusters."
        )
        return None
    labels = _fit_partition(method, features, k_range)
    score = _safe_silhouette(features, labels)
    if score is None:
        st.warning("Silhouette score is undefined for this clustering.")
    else:
        st.write(f"{method} Silhouette Score for {k_range} clusters: {score}")
    return labels


def _plot_clusters(features, labels):
    """Scatter the first two features coloured by cluster label."""
    st.write("Cluster Labels:", np.unique(labels))
    if features.shape[1] < 2:
        st.info("At least two numerical columns are needed for the scatter plot.")
        return
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=features[:, 0], y=features[:, 1],
        hue=labels, palette='Set1', ax=ax,
    )
    st.pyplot(fig)
    plt.close(fig)  # avoid accumulating figures across Streamlit reruns


def _find_datetime_column(frame):
    """Return ``(name, parsed_values)`` of the first date/time column.

    Columns already stored as datetimes are accepted as-is. Text columns are
    accepted when every non-missing value parses as a timestamp, because
    ``pd.read_csv`` does not parse dates unless asked to.
    Returns ``(None, None)`` when no such column exists.
    """
    for col in frame.columns:
        column = frame[col]
        if pd.api.types.is_datetime64_any_dtype(column):
            return col, column
        if not pd.api.types.is_object_dtype(column):
            continue
        try:
            parsed = pd.to_datetime(column, errors="coerce")
        except (ValueError, TypeError):
            continue
        if not pd.api.types.is_datetime64_any_dtype(parsed):
            continue
        n_parsed = int(parsed.notna().sum())
        if n_parsed > 0 and n_parsed == int(column.notna().sum()):
            return col, parsed
    return None, None


def _run_arima(data, numerical_cols):
    """Fit and plot an ARIMA model when the data has a date/time column."""
    time_series_col, timestamps = _find_datetime_column(data)
    if time_series_col is None:
        return
    value_cols = [col for col in numerical_cols if col != time_series_col]
    if not value_cols:
        return

    st.write("Time Series Analysis (ARIMA) on column:", time_series_col)
    value_col = st.selectbox("Column to model", value_cols)

    # Model the chosen numeric column ordered by time, not the timestamps.
    series = pd.Series(
        data[value_col].to_numpy(dtype=float),
        index=pd.DatetimeIndex(timestamps),
        name=value_col,
    )
    series = series[series.index.notna()].dropna().sort_index()

    # ARIMA model order
    p = int(st.number_input("ARIMA p value", min_value=0, max_value=5, value=1))
    d = int(st.number_input("ARIMA d value", min_value=0, max_value=2, value=1))
    q = int(st.number_input("ARIMA q value", min_value=0, max_value=5, value=1))

    if len(series) <= p + d + q + 1:
        st.warning("Not enough observations to fit the selected ARIMA order.")
        return

    try:
        arima_result = ARIMA(series, order=(p, d, q)).fit()
    except (ValueError, np.linalg.LinAlgError) as err:
        st.error(f"ARIMA fitting failed: {err}")
        return

    # Display ARIMA result summary
    st.text(arima_result.summary().as_text())

    # Plot the observations against the one-step-ahead in-sample predictions.
    # The first ``d`` predictions are skipped because differencing leaves
    # them without a meaningful value.
    fitted = np.asarray(arima_result.predict(start=d, dynamic=False))
    fig, ax = plt.subplots()
    ax.plot(series.index, series.to_numpy(), label="Observed")
    ax.plot(series.index[d:], fitted, label="Predicted")
    ax.set_xlabel(str(time_series_col))
    ax.set_ylabel(str(value_col))
    ax.legend()
    st.pyplot(fig)
    plt.close(fig)


def _silhouette_table(features):
    """Tabulate K-Means and hierarchical silhouette scores for 2-7 clusters.

    Cluster counts for which the score cannot be computed on ``features``
    are reported as NaN.
    """
    rows = []
    for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS + 1):
        scores = []
        for method in ("K-Means", "Hierarchical Clustering"):
            score = None
            if len(features) > n_clusters:
                labels = _fit_partition(method, features, n_clusters)
                score = _safe_silhouette(features, labels)
            scores.append(np.nan if score is None else score)
        rows.append((n_clusters, *scores))
    return pd.DataFrame(
        rows,
        columns=[
            'Number of Clusters',
            'K-Means Silhouette Score',
            'Hierarchical Silhouette Score',
        ],
    )


def main():
    """Run the Streamlit page from top to bottom."""
    # Streamlit app title
    st.title('Clustering and Time Series Analysis')

    # Step 1: Upload CSV file
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        return

    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        st.error(f"Could not read the CSV file: {err}")
        return
    st.write("Dataset Preview:", data.head())

    # Step 2: Data Preprocessing
    # Selecting only numerical columns for clustering
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    st.write("Numerical columns for clustering:", numerical_cols)
    if not numerical_cols:
        st.warning("The dataset has no numerical columns to cluster.")
        return

    # The clustering estimators reject NaN, so keep only complete rows.
    complete_rows = data[numerical_cols].dropna()
    n_dropped = len(data) - len(complete_rows)
    if n_dropped:
        st.info(f"Ignoring {n_dropped} row(s) with missing numerical values.")
    if len(complete_rows) < MIN_CLUSTERS + 1:
        st.warning("Not enough complete rows to cluster.")
        return

    # Option to scale data or not
    scale_data = st.checkbox("Scale Data", value=True)
    if scale_data:
        data_scaled = StandardScaler().fit_transform(complete_rows)
    else:
        data_scaled = complete_rows.to_numpy(dtype=float)

    # Step 3: Clustering Algorithm Selection
    cluster_labels = _run_clustering(data_scaled)

    # Step 4: Visualize the clusters if valid
    if cluster_labels is not None and np.unique(cluster_labels).size > 1:
        _plot_clusters(data_scaled, cluster_labels)

    # Step 5: ARIMA Time Series Analysis
    _run_arima(data, numerical_cols)

    # Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
    st.write("### Silhouette Score Table for 2-7 Clusters")
    st.write(_silhouette_table(data_scaled))


if __name__ == "__main__":
    main()