Spaces:
Sleeping
Sleeping
File size: 6,543 Bytes
3ca4800 3972ce2 3ca4800 3972ce2 3ca4800 3972ce2 3ca4800 3972ce2 b8f81ca 3972ce2 b8f81ca 3972ce2 3ca4800 3972ce2 3ca4800 3972ce2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
import plotly.express as px
import plotly.graph_objects as go
# Streamlit app: upload a CSV, cluster its numerical columns, optionally fit
# an ARIMA model against a detected datetime column, and compare silhouette
# scores of K-Means and hierarchical clustering over a range of cluster counts.

MIN_CLUSTERS = 2
MAX_CLUSTERS = 7


def clean_numeric_frame(data, numerical_cols):
    """Return the numerical columns of *data* restricted to complete rows.

    Rows containing NaN or +/-inf in any of ``numerical_cols`` are dropped,
    because the clustering estimators used below reject such values.
    """
    numeric = data[numerical_cols].replace([np.inf, -np.inf], np.nan)
    return numeric.dropna()


def cluster_count_upper_bound(n_samples, cap=MAX_CLUSTERS):
    """Return the largest cluster count that can be scored for *n_samples* rows.

    The silhouette score is only defined when the number of clusters is
    between 2 and ``n_samples - 1``; the result may be below ``MIN_CLUSTERS``,
    which callers must treat as "not enough rows to cluster".
    """
    return min(cap, n_samples - 1)


def clustering_silhouette(features, labels):
    """Return the silhouette score of a clustering, or ``None`` if undefined.

    Points labelled ``-1`` (DBSCAN noise) are excluded so that noise is not
    scored as if it were a cluster.  ``None`` is returned when fewer than two
    clusters remain or when every remaining point is its own cluster, the two
    situations in which ``silhouette_score`` would raise.
    """
    labels = np.asarray(labels)
    assigned = labels != -1
    kept = labels[assigned]
    n_clusters = len(np.unique(kept))
    if not 2 <= n_clusters <= len(kept) - 1:
        return None
    return silhouette_score(np.asarray(features)[assigned], kept)


def find_datetime_column(data):
    """Locate the first column of *data* that holds timestamps.

    Columns that already have a datetime dtype are accepted directly.  Text
    columns are accepted when every non-missing value parses as a datetime;
    this matters because ``pd.read_csv`` leaves date columns as plain strings
    unless told otherwise.

    Returns a ``(column_name, timestamps)`` tuple, or ``None`` if no column
    qualifies.
    """
    import warnings

    for col in data.columns:
        series = data[col]
        if pd.api.types.is_datetime64_any_dtype(series):
            return col, series
        is_text = (pd.api.types.is_object_dtype(series)
                   or pd.api.types.is_string_dtype(series))
        if not is_text:
            continue
        present = series.notna().sum()
        if present == 0:
            continue
        try:
            with warnings.catch_warnings():
                # Probing non-date text triggers format-inference warnings.
                warnings.simplefilter("ignore")
                parsed = pd.to_datetime(series, errors="coerce")
        except (ValueError, TypeError, OverflowError):
            continue
        if (pd.api.types.is_datetime64_any_dtype(parsed)
                and parsed.notna().sum() == present):
            return col, parsed
    return None


def choose_cluster_count(label, max_k):
    """Ask the user for a cluster count between ``MIN_CLUSTERS`` and *max_k*."""
    if max_k <= MIN_CLUSTERS:
        # A slider needs a non-empty range; only one choice is possible here.
        return MIN_CLUSTERS
    return st.slider(label, min_value=MIN_CLUSTERS, max_value=max_k,
                     value=min(3, max_k))


def render_clustering(data_scaled, numerical_cols, max_k):
    """Run the user-selected clustering algorithm and plot the result."""
    # Step 3: Clustering Algorithm Selection
    clustering_method = st.selectbox(
        "Choose a clustering method",
        ["K-Means", "Hierarchical Clustering", "DBSCAN"],
    )

    if clustering_method == "K-Means":
        k_range = choose_cluster_count("Select number of clusters for K-Means", max_k)
        cluster_labels = KMeans(n_clusters=k_range, random_state=42).fit_predict(data_scaled)
        silhouette_avg = clustering_silhouette(data_scaled, cluster_labels)
        st.write(f"K-Means Silhouette Score for {k_range} clusters: {silhouette_avg}")
    elif clustering_method == "Hierarchical Clustering":
        k_range = choose_cluster_count(
            "Select number of clusters for Hierarchical Clustering", max_k)
        cluster_labels = AgglomerativeClustering(n_clusters=k_range).fit_predict(data_scaled)
        silhouette_avg = clustering_silhouette(data_scaled, cluster_labels)
        st.write(f"Hierarchical Clustering Silhouette Score for {k_range} clusters: {silhouette_avg}")
    else:  # DBSCAN
        eps_value = st.slider("Select eps value for DBSCAN", min_value=0.1, max_value=2.0, value=0.5)
        min_samples_value = st.slider("Select minimum samples for DBSCAN", min_value=1, max_value=10, value=5)
        cluster_labels = DBSCAN(eps=eps_value, min_samples=min_samples_value).fit_predict(data_scaled)
        silhouette_avg = clustering_silhouette(data_scaled, cluster_labels)
        if silhouette_avg is None:
            st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")
        else:
            noise_count = int(np.sum(cluster_labels == -1))
            st.write(f"DBSCAN Silhouette Score: {silhouette_avg}")
            st.write(f"Noise points excluded from the score: {noise_count}")

    # Step 4: Visualize the clusters using Plotly
    if len(set(cluster_labels)) <= 1:
        return
    st.write("Cluster Labels:", np.unique(cluster_labels))
    if data_scaled.shape[1] < 2:
        st.warning("At least two numerical columns are needed for the scatter plot.")
        return
    fig = px.scatter(
        x=data_scaled[:, 0],
        y=data_scaled[:, 1],
        # String labels give one discrete colour per cluster instead of a
        # continuous colour scale.
        color=cluster_labels.astype(str),
        title="Clustering Results",
        labels={'x': numerical_cols[0], 'y': numerical_cols[1], 'color': 'Cluster'},
    )
    # The axes are left auto-ranged: standardized features routinely fall
    # outside [-1, 1], so a fixed range would hide points.
    fig.update_layout(xaxis_title=numerical_cols[0], yaxis_title=numerical_cols[1])
    st.plotly_chart(fig)


def render_arima(data, numerical_cols):
    """Fit and plot an ARIMA model if the data contains a datetime column."""
    # Step 5: ARIMA Time Series Analysis
    found = find_datetime_column(data)
    if found is None:
        return
    time_series_col, timestamps = found
    st.write("Time Series Analysis (ARIMA) on column:", time_series_col)

    # ARIMA models a numeric series; the datetime column only orders it.
    value_col = st.selectbox("Select the numerical column to model with ARIMA", numerical_cols)
    frame = pd.DataFrame({
        "time": timestamps,
        "value": data[value_col].astype(float).replace([np.inf, -np.inf], np.nan),
    }).dropna().sort_values("time")
    values = frame["value"].to_numpy()

    # ARIMA model order
    p = st.number_input("ARIMA p value", min_value=0, max_value=5, value=1)
    d = st.number_input("ARIMA d value", min_value=0, max_value=2, value=1)
    q = st.number_input("ARIMA q value", min_value=0, max_value=5, value=1)

    # Sanity bound: more observations than model terms (plus the variance).
    if len(values) <= p + d + q + 1:
        st.warning("Not enough observations to fit the selected ARIMA order.")
        return

    try:
        arima_result = ARIMA(values, order=(p, d, q)).fit()
        # The first d predictions are differencing artefacts, so skip them.
        predicted = arima_result.predict(start=d)
    except (ValueError, np.linalg.LinAlgError) as exc:
        st.error(f"ARIMA model could not be fitted: {exc}")
        return

    # Display ARIMA result summary
    st.text(arima_result.summary().as_text())

    # Plotting the ARIMA results
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=frame["time"], y=values,
                             mode='lines', name='Observed'))
    fig.add_trace(go.Scatter(x=frame["time"].iloc[d:], y=predicted,
                             mode='lines', name='ARIMA in-sample prediction'))
    fig.update_layout(title="ARIMA Fit", xaxis_title=time_series_col, yaxis_title=value_col)
    st.plotly_chart(fig)


def render_silhouette_table(data_scaled, max_k):
    """Tabulate and plot silhouette scores for K-Means and hierarchical clustering."""
    # Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
    st.write(f"### Silhouette Score Table for {MIN_CLUSTERS}-{max_k} Clusters")
    rows = []
    for n_clusters in range(MIN_CLUSTERS, max_k + 1):
        kmeans_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(data_scaled)
        hierarchical_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(data_scaled)
        rows.append({
            'Number of Clusters': n_clusters,
            'K-Means Silhouette Score': clustering_silhouette(data_scaled, kmeans_labels),
            'Hierarchical Silhouette Score': clustering_silhouette(data_scaled, hierarchical_labels),
        })
    silhouette_df = pd.DataFrame(
        rows,
        columns=['Number of Clusters', 'K-Means Silhouette Score', 'Hierarchical Silhouette Score'],
    )
    st.dataframe(silhouette_df)

    # Plot the Silhouette Score Table using Plotly
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=silhouette_df['Number of Clusters'], y=silhouette_df['K-Means Silhouette Score'],
                             mode='lines+markers', name='K-Means Silhouette Score'))
    fig.add_trace(go.Scatter(x=silhouette_df['Number of Clusters'], y=silhouette_df['Hierarchical Silhouette Score'],
                             mode='lines+markers', name='Hierarchical Silhouette Score'))
    # Silhouette scores always lie in [-1, 1], so a fixed axis is safe here.
    fig.update_layout(
        title="Silhouette Scores for K-Means and Hierarchical Clustering",
        xaxis_title="Number of Clusters",
        yaxis_title="Silhouette Score",
        yaxis=dict(range=[-1, 1], dtick=0.2),
    )
    st.plotly_chart(fig)


def main():
    """Render the whole app: upload, clustering, ARIMA, silhouette comparison."""
    # Streamlit app title
    st.title('Clustering and Time Series Analysis')

    # Step 1: Upload CSV file
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        return
    data = pd.read_csv(uploaded_file)
    st.write("Dataset Preview:", data.head())

    # Step 2: Data Preprocessing
    # Selecting only numerical columns for clustering
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    st.write("Numerical columns for clustering:", numerical_cols)
    if not numerical_cols:
        st.warning("The uploaded file has no numerical columns to analyse.")
        return

    clean = clean_numeric_frame(data, numerical_cols)
    dropped = len(data) - len(clean)
    if dropped:
        st.write(f"Dropped {dropped} rows with missing or infinite values before clustering.")

    max_k = cluster_count_upper_bound(len(clean))
    can_cluster = max_k >= MIN_CLUSTERS
    data_scaled = None
    if can_cluster:
        # Step 2.1: Data Standardization using StandardScaler (always applied)
        data_scaled = StandardScaler().fit_transform(clean)
        st.write("Data has been standardized using StandardScaler.")
        render_clustering(data_scaled, numerical_cols, max_k)
    else:
        st.warning("At least 3 complete rows are needed for clustering.")

    render_arima(data, numerical_cols)

    if can_cluster:
        render_silhouette_table(data_scaled, max_k)


# Streamlit executes the script as ``__main__``, so the app still renders on
# every rerun while the helpers above remain importable without side effects.
if __name__ == "__main__":
    main()
|