# IndSensor / app.py
# Spencer525's picture
# Create app.py
# 3ca4800 verified
# raw
# history blame
# 5.32 kB
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import seaborn as sns
# Streamlit app: cluster the numerical columns of an uploaded CSV, optionally
# fit an ARIMA model when a datetime column is present, and compare silhouette
# scores of K-Means and hierarchical clustering for several cluster counts.

# Range of cluster counts offered by the sliders and compared in the table.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 7


def _silhouette_or_none(features, labels):
    """Return the silhouette score of *labels*, or None when it is undefined.

    ``silhouette_score`` only accepts between 2 and ``n_samples - 1`` distinct
    labels; outside that range it raises ``ValueError``. DBSCAN with
    ``min_samples=1`` can put every point in its own cluster, so the check is
    needed before scoring.
    """
    n_labels = len(set(labels))
    if 2 <= n_labels <= len(labels) - 1:
        return silhouette_score(features, labels)
    return None


def _report_score(description, score):
    """Write a silhouette score, or a note that it could not be computed."""
    if score is None:
        st.write(f"{description} could not be computed for this clustering.")
    else:
        st.write(f"{description}: {score}")


def _prepare_features(data):
    """Select, clean and optionally scale the numerical columns of *data*.

    Returns ``(numerical_cols, data_scaled)``. ``data_scaled`` is a 2-D array,
    or None when the data cannot be clustered (no numerical columns or fewer
    than three complete rows).
    """
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    st.write("Numerical columns for clustering:", numerical_cols)
    if not numerical_cols:
        st.warning("The dataset has no numerical columns, so clustering is skipped.")
        return numerical_cols, None

    # The estimators used below reject NaN and infinite values, so rows
    # containing them are left out of the clustering.
    features = data[numerical_cols].replace([np.inf, -np.inf], np.nan).dropna()
    n_dropped = len(data) - len(features)
    if n_dropped:
        st.info(f"Ignoring {n_dropped} row(s) with missing or infinite numerical values.")
    if len(features) < MIN_CLUSTERS + 1:
        st.warning("At least 3 complete rows are needed for clustering, so it is skipped.")
        return numerical_cols, None

    # Option to scale data or not
    scale_data = st.checkbox("Scale Data", value=True)
    if scale_data:
        return numerical_cols, StandardScaler().fit_transform(features)
    return numerical_cols, features.values


def _cluster_count_slider(label, max_clusters):
    """Ask for a cluster count between MIN_CLUSTERS and *max_clusters*."""
    if max_clusters <= MIN_CLUSTERS:
        # Only one choice is possible; a slider would offer nothing to select.
        return MIN_CLUSTERS
    return st.slider(label, min_value=MIN_CLUSTERS, max_value=max_clusters, value=min(3, max_clusters))


def _run_clustering(data_scaled):
    """Let the user pick a clustering method, run it and report its score.

    Returns the array of cluster labels, one per row of *data_scaled*.
    """
    # A silhouette score needs fewer clusters than samples, so the selectable
    # cluster count is capped for very small datasets.
    max_clusters = min(MAX_CLUSTERS, data_scaled.shape[0] - 1)

    clustering_method = st.selectbox("Choose a clustering method", ["K-Means", "Hierarchical Clustering", "DBSCAN"])

    if clustering_method == "K-Means":
        k_range = _cluster_count_slider("Select number of clusters for K-Means", max_clusters)
        cluster_labels = KMeans(n_clusters=k_range, random_state=42).fit_predict(data_scaled)
        _report_score(
            f"K-Means Silhouette Score for {k_range} clusters",
            _silhouette_or_none(data_scaled, cluster_labels),
        )
    elif clustering_method == "Hierarchical Clustering":
        k_range = _cluster_count_slider("Select number of clusters for Hierarchical Clustering", max_clusters)
        cluster_labels = AgglomerativeClustering(n_clusters=k_range).fit_predict(data_scaled)
        _report_score(
            f"Hierarchical Clustering Silhouette Score for {k_range} clusters",
            _silhouette_or_none(data_scaled, cluster_labels),
        )
    else:  # "DBSCAN"
        eps_value = st.slider("Select eps value for DBSCAN", min_value=0.1, max_value=2.0, value=0.5)
        min_samples_value = st.slider("Select minimum samples for DBSCAN", min_value=1, max_value=10, value=5)
        cluster_labels = DBSCAN(eps=eps_value, min_samples=min_samples_value).fit_predict(data_scaled)
        # DBSCAN may return a single label or one label per point; neither
        # can be scored.
        silhouette_avg = _silhouette_or_none(data_scaled, cluster_labels)
        if silhouette_avg is not None:
            st.write(f"DBSCAN Silhouette Score: {silhouette_avg}")
        else:
            st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")

    return cluster_labels


def _plot_clusters(data_scaled, cluster_labels, numerical_cols):
    """Scatter-plot the first two features coloured by cluster label."""
    if len(set(cluster_labels)) <= 1:
        return
    st.write("Cluster Labels:", np.unique(cluster_labels))
    if data_scaled.shape[1] < 2:
        st.info("A scatter plot needs at least two numerical columns.")
        return

    # Draw on an explicit figure rather than the global pyplot state, and
    # close it afterwards so figures do not pile up across reruns.
    fig, ax = plt.subplots()
    sns.scatterplot(x=data_scaled[:, 0], y=data_scaled[:, 1], hue=cluster_labels, palette='Set1', ax=ax)
    ax.set_xlabel(str(numerical_cols[0]))
    ax.set_ylabel(str(numerical_cols[1]))
    st.pyplot(fig)
    plt.close(fig)


def _find_datetime_column(data):
    """Return ``(name, timestamps)`` for the first datetime-like column.

    A column qualifies when it already has a datetime dtype, or when it holds
    text whose every non-null value parses as a datetime (a CSV read without
    ``parse_dates`` delivers dates as text). Returns ``(None, None)`` when no
    column qualifies.
    """
    for col in data.columns:
        column = data[col]
        if pd.api.types.is_datetime64_any_dtype(column):
            return col, column
        if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
            continue
        n_values = int(column.notna().sum())
        if n_values == 0:
            continue
        try:
            parsed = pd.to_datetime(column, errors="coerce")
        except (ValueError, TypeError, OverflowError):
            continue
        if pd.api.types.is_datetime64_any_dtype(parsed) and int(parsed.notna().sum()) == n_values:
            return col, parsed
    return None, None


def _run_arima(data, numerical_cols):
    """Fit and plot an ARIMA model if the data has a datetime column.

    The datetime column becomes the index and a user-selected numerical
    column provides the values; fitting ARIMA to the timestamps themselves
    would not be meaningful.
    """
    time_series_col, timestamps = _find_datetime_column(data)
    if time_series_col is None:
        return
    if not numerical_cols:
        st.info("A datetime column was found, but there is no numerical column to model with ARIMA.")
        return

    value_col = st.selectbox("Column to model with ARIMA", numerical_cols)
    st.write("Time Series Analysis (ARIMA) on column:", value_col)
    st.write("Time index column:", time_series_col)

    series = pd.Series(
        data[value_col].to_numpy(dtype=float),
        index=pd.DatetimeIndex(timestamps),
        name=value_col,
    )
    series = series[series.index.notna()].dropna().sort_index()

    # ARIMA model order
    p = st.number_input("ARIMA p value", min_value=0, max_value=5, value=1)
    d = st.number_input("ARIMA d value", min_value=0, max_value=2, value=1)
    q = st.number_input("ARIMA q value", min_value=0, max_value=5, value=1)

    if len(series) <= int(d) + 1:
        st.warning("Too few observations to fit an ARIMA model.")
        return

    try:
        arima_result = ARIMA(series, order=(p, d, q)).fit()
    except (ValueError, np.linalg.LinAlgError) as exc:
        st.error(f"ARIMA could not be fitted: {exc}")
        return

    # Display ARIMA result summary
    st.write(arima_result.summary())

    # Plot observed values against one-step-ahead in-sample predictions.
    # Predictions start after the first `d` observations, which the
    # differencing consumes.
    prediction = arima_result.get_prediction(start=int(d), dynamic=False)
    predicted_mean = np.asarray(prediction.predicted_mean)
    bounds = np.asarray(prediction.conf_int())  # default alpha=0.05
    if predicted_mean.size == 0:
        return
    # Align by position so the plot does not depend on how statsmodels
    # labels its prediction index.
    plot_index = series.index[-predicted_mean.size:]

    fig, ax = plt.subplots()
    ax.plot(series.index, series.to_numpy(), label="Observed")
    ax.plot(plot_index, predicted_mean, label="Predicted")
    ax.fill_between(plot_index, bounds[:, 0], bounds[:, 1], alpha=0.2, label="95% confidence interval")
    ax.legend()
    fig.autofmt_xdate()
    st.pyplot(fig)
    plt.close(fig)


def _show_silhouette_table(data_scaled):
    """Tabulate K-Means and hierarchical silhouette scores per cluster count."""
    st.write("### Silhouette Score Table for 2-7 Clusters")
    columns = ['Number of Clusters', 'K-Means Silhouette Score', 'Hierarchical Silhouette Score']
    rows = []
    # Cluster counts that are not smaller than the sample count cannot be
    # scored, so they are left out for very small datasets.
    max_clusters = min(MAX_CLUSTERS, data_scaled.shape[0] - 1)
    for n_clusters in range(MIN_CLUSTERS, max_clusters + 1):
        kmeans_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(data_scaled)
        hierarchical_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(data_scaled)
        rows.append({
            'Number of Clusters': n_clusters,
            'K-Means Silhouette Score': _silhouette_or_none(data_scaled, kmeans_labels),
            'Hierarchical Silhouette Score': _silhouette_or_none(data_scaled, hierarchical_labels),
        })
    silhouette_df = pd.DataFrame(rows, columns=columns)
    st.write(silhouette_df)


# Streamlit app title
st.title('Clustering and Time Series Analysis')

# Step 1: Upload CSV file
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

if uploaded_file is not None:
    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
        st.stop()
    st.write("Dataset Preview:", data.head())

    # Step 2: Data Preprocessing (numerical columns only)
    numerical_cols, data_scaled = _prepare_features(data)

    if data_scaled is not None:
        # Step 3: Clustering Algorithm Selection
        cluster_labels = _run_clustering(data_scaled)
        # Step 4: Visualize the clusters if valid
        _plot_clusters(data_scaled, cluster_labels, numerical_cols)

    # Step 5: ARIMA Time Series Analysis
    _run_arima(data, numerical_cols)

    # Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
    if data_scaled is not None:
        _show_silhouette_table(data_scaled)