# Scraped page header, not part of the program: Spaces: Sleeping; File size: 5,317 Bytes; 3ca4800.
# (The file viewer's line-number gutter, 1-119, followed here.)
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import seaborn as sns
# Streamlit app: cluster the numerical columns of an uploaded CSV and, when a
# date column is present, fit an ARIMA model to one numerical series.
#
# The UI lives in main(). Streamlit executes the script with
# __name__ == "__main__", so the guard at the bottom runs the app under
# `streamlit run` while keeping the helpers importable without side effects.

RANDOM_STATE = 42
MIN_CLUSTERS = 2
MAX_CLUSTERS = 7
# silhouette_score needs at least 2 clusters and fewer clusters than samples.
MIN_ROWS_FOR_CLUSTERING = MIN_CLUSTERS + 1


def is_valid_partition(labels, n_samples):
    """Return True when *labels* can be scored with the silhouette coefficient.

    scikit-learn's ``silhouette_score`` raises unless the number of distinct
    labels lies in ``[2, n_samples - 1]``.
    """
    n_labels = np.unique(labels).size
    return 2 <= n_labels <= n_samples - 1


def safe_silhouette(features, labels):
    """Return the silhouette score as a float, or None when it is undefined.

    The validity check runs first so that degenerate labelings (a single
    cluster, or every point in its own cluster) never reach scikit-learn.
    """
    if not is_valid_partition(labels, len(labels)):
        return None
    return float(silhouette_score(features, labels))


def find_datetime_column(frame):
    """Return the name of the first date-like column in *frame*, or None.

    A column qualifies when it already has a datetime dtype, or when it is a
    text column whose non-missing values all parse as dates. The second rule
    matters because ``pd.read_csv`` leaves dates as plain strings unless
    ``parse_dates`` is given, so a dtype check alone never matches.
    """
    import warnings

    for name in frame.columns:
        column = frame[name]
        if pd.api.types.is_datetime64_any_dtype(column):
            return name
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if not is_text:
            continue
        non_missing = column.dropna()
        # Only consider genuine strings; other objects could be coerced to
        # timestamps by accident (e.g. integers read as epoch offsets).
        if non_missing.empty or not all(isinstance(v, str) for v in non_missing):
            continue
        try:
            with warnings.catch_warnings():
                # Format-inference warnings are expected for non-date text.
                warnings.simplefilter("ignore")
                parsed = pd.to_datetime(non_missing, errors="coerce")
        except (ValueError, TypeError):
            continue
        if parsed.notna().all():
            return name
    return None


def prepare_features(data, numerical_cols):
    """Return the clustering matrix built from *numerical_cols* of *data*.

    Rows with missing numerical values are dropped because the clustering
    estimators reject NaN. Scaling is applied when the "Scale Data" checkbox
    is ticked.
    """
    numeric = data[numerical_cols].dropna()
    dropped = len(data) - len(numeric)
    if dropped:
        st.write(f"Dropped {dropped} rows with missing numerical values before clustering.")

    scale_data = st.checkbox("Scale Data", value=True)
    if scale_data and not numeric.empty:
        return StandardScaler().fit_transform(numeric)
    return numeric.to_numpy()


def run_selected_clustering(features):
    """Let the user pick a clustering method, run it, and return the labels."""
    n_samples = features.shape[0]
    # Cap the cluster count so a silhouette score is always defined.
    max_clusters = min(MAX_CLUSTERS, n_samples - 1)

    method = st.selectbox(
        "Choose a clustering method",
        ["K-Means", "Hierarchical Clustering", "DBSCAN"],
    )

    if method == "DBSCAN":
        eps_value = st.slider("Select eps value for DBSCAN", min_value=0.1, max_value=2.0, value=0.5)
        min_samples_value = st.slider("Select minimum samples for DBSCAN", min_value=1, max_value=10, value=5)
        labels = DBSCAN(eps=eps_value, min_samples=min_samples_value).fit_predict(features)
        score = safe_silhouette(features, labels)
        if score is None:
            st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")
        else:
            st.write(f"DBSCAN Silhouette Score: {score}")
        return labels

    if max_clusters > MIN_CLUSTERS:
        n_clusters = st.slider(
            f"Select number of clusters for {method}",
            min_value=MIN_CLUSTERS,
            max_value=max_clusters,
            value=min(3, max_clusters),
        )
    else:
        # Only one cluster count is possible, so there is nothing to choose.
        n_clusters = MIN_CLUSTERS

    if method == "K-Means":
        model = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
    else:
        model = AgglomerativeClustering(n_clusters=n_clusters)
    labels = model.fit_predict(features)

    score = safe_silhouette(features, labels)
    if score is None:
        st.write(f"{method} did not form valid clusters for {n_clusters} clusters.")
    else:
        st.write(f"{method} Silhouette Score for {n_clusters} clusters: {score}")
    return labels


def plot_clusters(features, labels):
    """Show the cluster labels and a scatter plot of the first two features."""
    if np.unique(labels).size <= 1:
        return
    st.write("Cluster Labels:", np.unique(labels))
    if features.shape[1] < 2:
        st.write("At least two numerical columns are needed for the cluster scatter plot.")
        return
    # Draw on an explicit figure instead of the global pyplot state, and close
    # it afterwards so figures do not pile up across Streamlit reruns.
    fig, ax = plt.subplots()
    sns.scatterplot(x=features[:, 0], y=features[:, 1], hue=labels, palette='Set1', ax=ax)
    st.pyplot(fig)
    plt.close(fig)


def run_arima(data, numerical_cols):
    """Fit and display an ARIMA model when *data* has a date-like column.

    The model is fitted to a user-selected numerical column indexed by the
    detected date column; nothing is shown when no date column exists.
    """
    time_series_col = find_datetime_column(data)
    if time_series_col is None:
        return

    st.write("Time Series Analysis (ARIMA) on column:", time_series_col)
    value_col = st.selectbox("Choose the numerical column to model with ARIMA", numerical_cols)

    timestamps = pd.to_datetime(data[time_series_col], errors="coerce")
    series = (
        pd.DataFrame({"when": timestamps, "value": data[value_col]})
        .dropna()
        .set_index("when")["value"]
        .sort_index()
    )

    # ARIMA model order
    p = st.number_input("ARIMA p value", min_value=0, max_value=5, value=1)
    d = st.number_input("ARIMA d value", min_value=0, max_value=2, value=1)
    q = st.number_input("ARIMA q value", min_value=0, max_value=5, value=1)
    order = (int(p), int(d), int(q))

    if series.empty:
        st.warning("No complete (date, value) pairs are available for ARIMA.")
        return

    try:
        arima_result = ARIMA(series, order=order).fit()
    except (ValueError, IndexError, np.linalg.LinAlgError) as exc:
        st.error(f"ARIMA could not be fitted with order {order}: {exc}")
        return

    # Render the summary as preformatted text so the table layout survives.
    st.text(arima_result.summary().as_text())

    # Results from statsmodels.tsa.arima.model.ARIMA have no plot_predict
    # method, so the in-sample prediction is plotted by hand.
    fitted = np.asarray(arima_result.predict(dynamic=False))
    skip = order[1]  # the first d predictions lack differencing history
    fig, ax = plt.subplots()
    ax.plot(series.index, series.to_numpy(), label="Observed")
    ax.plot(series.index[skip:], fitted[skip:], label="In-sample prediction")
    ax.legend()
    st.pyplot(fig)
    plt.close(fig)


def build_silhouette_table(features):
    """Return a DataFrame of K-Means and hierarchical silhouette scores.

    Cluster counts run from 2 to 7, capped so each count stays below the
    number of samples; an undefined score is recorded as a missing value.
    """
    n_samples = features.shape[0]
    silhouette_scores = {
        'Number of Clusters': [],
        'K-Means Silhouette Score': [],
        'Hierarchical Silhouette Score': [],
    }
    for n_clusters in range(MIN_CLUSTERS, min(MAX_CLUSTERS, n_samples - 1) + 1):
        kmeans_labels = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE).fit_predict(features)
        hierarchical_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(features)
        silhouette_scores['Number of Clusters'].append(n_clusters)
        silhouette_scores['K-Means Silhouette Score'].append(
            safe_silhouette(features, kmeans_labels))
        silhouette_scores['Hierarchical Silhouette Score'].append(
            safe_silhouette(features, hierarchical_labels))
    return pd.DataFrame(silhouette_scores)


def main():
    """Run the Streamlit page: upload, clustering, ARIMA, silhouette table."""
    st.title('Clustering and Time Series Analysis')

    # Step 1: Upload CSV file
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        return

    data = pd.read_csv(uploaded_file)
    st.write("Dataset Preview:", data.head())

    # Step 2: Data preprocessing - only numerical columns are clustered
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    st.write("Numerical columns for clustering:", numerical_cols)
    if not numerical_cols:
        st.warning("The uploaded file has no numerical columns to analyse.")
        return

    features = prepare_features(data, numerical_cols)
    can_cluster = features.shape[0] >= MIN_ROWS_FOR_CLUSTERING

    # Steps 3-4: clustering and visualisation
    if can_cluster:
        cluster_labels = run_selected_clustering(features)
        plot_clusters(features, cluster_labels)
    else:
        st.warning(
            f"Clustering needs at least {MIN_ROWS_FOR_CLUSTERING} rows "
            "without missing numerical values."
        )

    # Step 5: ARIMA time series analysis
    run_arima(data, numerical_cols)

    # Step 6: silhouette score table for K-Means and hierarchical clustering
    if can_cluster:
        st.write("### Silhouette Score Table for 2-7 Clusters")
        st.write(build_silhouette_table(features))


if __name__ == "__main__":
    main()