Spencer525 committed on
Commit
3972ce2
·
verified ·
1 Parent(s): 9818823

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -18
app.py CHANGED
@@ -5,8 +5,8 @@ from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
5
  from sklearn.metrics import silhouette_score
6
  from sklearn.preprocessing import StandardScaler
7
  from statsmodels.tsa.arima.model import ARIMA
8
- import matplotlib.pyplot as plt
9
- import seaborn as sns
10
 
11
  # Streamlit app title
12
  st.title('Clustering and Time Series Analysis')
@@ -23,13 +23,10 @@ if uploaded_file is not None:
23
  numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
24
  st.write("Numerical columns for clustering:", numerical_cols)
25
 
26
- # Option to scale data or not
27
- scale_data = st.checkbox("Scale Data", value=True)
28
- if scale_data:
29
- scaler = StandardScaler()
30
- data_scaled = scaler.fit_transform(data[numerical_cols])
31
- else:
32
- data_scaled = data[numerical_cols].values
33
 
34
  # Step 3: Clustering Algorithm Selection
35
  clustering_method = st.selectbox("Choose a clustering method", ["K-Means", "Hierarchical Clustering", "DBSCAN"])
@@ -61,11 +58,13 @@ if uploaded_file is not None:
61
  else:
62
  st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")
63
 
64
- # Step 4: Visualize the clusters if valid
65
  if len(set(cluster_labels)) > 1:
66
  st.write("Cluster Labels:", np.unique(cluster_labels))
67
- sns.scatterplot(x=data_scaled[:, 0], y=data_scaled[:, 1], hue=cluster_labels, palette='Set1')
68
- st.pyplot(plt)
 
 
69
 
70
  # Step 5: ARIMA Time Series Analysis
71
  # Checking if there are any time-related columns
@@ -89,11 +88,11 @@ if uploaded_file is not None:
89
 
90
  # Display ARIMA result summary
91
  st.write(arima_result.summary())
92
-
93
- # Plotting the original and forecast
94
- fig, ax = plt.subplots()
95
- arima_result.plot_predict(dynamic=False, ax=ax)
96
- st.pyplot(fig)
97
 
98
  # Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
99
  st.write("### Silhouette Score Table for 2-7 Clusters")
@@ -115,4 +114,24 @@ if uploaded_file is not None:
115
  silhouette_scores['Hierarchical Silhouette Score'].append(hierarchical_silhouette)
116
 
117
  silhouette_df = pd.DataFrame(silhouette_scores)
118
- st.write(silhouette_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from sklearn.metrics import silhouette_score
6
  from sklearn.preprocessing import StandardScaler
7
  from statsmodels.tsa.arima.model import ARIMA
8
+ import plotly.express as px
9
+ import plotly.graph_objects as go
10
 
11
  # Streamlit app title
12
  st.title('Clustering and Time Series Analysis')
 
23
  numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
24
  st.write("Numerical columns for clustering:", numerical_cols)
25
 
26
+ # Step 2.1: Data Standardization using StandardScaler (always applied)
27
+ scaler = StandardScaler()
28
+ data_scaled = scaler.fit_transform(data[numerical_cols])
29
+ st.write("Data has been standardized using StandardScaler.")
 
 
 
30
 
31
  # Step 3: Clustering Algorithm Selection
32
  clustering_method = st.selectbox("Choose a clustering method", ["K-Means", "Hierarchical Clustering", "DBSCAN"])
 
58
  else:
59
  st.write("DBSCAN did not form valid clusters. Try adjusting eps or min_samples.")
60
 
61
+ # Step 4: Visualize the clusters using Plotly
62
  if len(set(cluster_labels)) > 1:
63
  st.write("Cluster Labels:", np.unique(cluster_labels))
64
+
65
+ fig = px.scatter(x=data_scaled[:, 0], y=data_scaled[:, 1], color=cluster_labels, title="Clustering Results",
66
+ labels={'x': numerical_cols[0], 'y': numerical_cols[1]})
67
+ st.plotly_chart(fig)
68
 
69
  # Step 5: ARIMA Time Series Analysis
70
  # Checking if there are any time-related columns
 
88
 
89
  # Display ARIMA result summary
90
  st.write(arima_result.summary())
91
+
92
+ # Plotting the ARIMA results
93
+ fig = go.Figure()
94
+ arima_result.plot_predict(dynamic=False, ax=fig.add_subplot(1, 1, 1))
95
+ st.plotly_chart(fig)
96
 
97
  # Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
98
  st.write("### Silhouette Score Table for 2-7 Clusters")
 
114
  silhouette_scores['Hierarchical Silhouette Score'].append(hierarchical_silhouette)
115
 
116
  silhouette_df = pd.DataFrame(silhouette_scores)
117
+
118
+ # Plot the Silhouette Score Table using Plotly
119
+ fig = go.Figure()
120
+
121
+ # Plot K-Means Silhouette Scores
122
+ fig.add_trace(go.Scatter(x=silhouette_df['Number of Clusters'], y=silhouette_df['K-Means Silhouette Score'],
123
+ mode='lines+markers', name='K-Means Silhouette Score'))
124
+
125
+ # Plot Hierarchical Silhouette Scores
126
+ fig.add_trace(go.Scatter(x=silhouette_df['Number of Clusters'], y=silhouette_df['Hierarchical Silhouette Score'],
127
+ mode='lines+markers', name='Hierarchical Silhouette Score'))
128
+
129
+ # Set the y-axis range from -1 to 1 with intervals of 0.2
130
+ fig.update_layout(
131
+ title="Silhouette Scores for K-Means and Hierarchical Clustering",
132
+ xaxis_title="Number of Clusters",
133
+ yaxis_title="Silhouette Score",
134
+ yaxis=dict(range=[-1, 1], dtick=0.2)
135
+ )
136
+
137
+ st.plotly_chart(fig)