|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sklearn.preprocessing import StandardScaler, LabelEncoder |
|
from sklearn.cluster import KMeans |
|
from sklearn.metrics import silhouette_score |
|
|
|
|
|
@st.cache_data
def load_data():
    """Read the mall-customers CSV into a DataFrame.

    The call is wrapped in ``st.cache_data``, so the parsed result is
    memoized by Streamlit instead of being re-read on every call.

    Returns:
        pandas.DataFrame: the contents of ``datasets/Mall_Customers.csv``.
    """
    return pd.read_csv("datasets/Mall_Customers.csv")
|
|
|
# The encoder and scaler stay at module level: the predictor tab reuses the
# fitted instances to transform the user's input.
le = LabelEncoder()
scaler = StandardScaler()

df = load_data()
del df["CustomerID"]  # drop the identifier column before encoding/scaling
df["Genre"] = le.fit_transform(df["Genre"])  # category labels -> integer codes
df_scaled = scaler.fit_transform(df)  # standardized feature matrix for K-Means
|
|
|
|
|
# Elbow-method data: fit one K-Means model per candidate K and keep its
# inertia (within-cluster sum of squares) for the chart in the first tab.
k_values = range(1, 11)
wcss = [
    KMeans(n_clusters=k, init="k-means++", random_state=1, n_init=10)
    .fit(df_scaled)
    .inertia_
    for k in k_values
]
|
|
|
|
|
# Final model: cluster the scaled features with the chosen K, attach the
# labels to the frame, and score the partition.
k_optimal = 5
kmeans = KMeans(
    n_clusters=k_optimal,
    init="k-means++",
    random_state=1,
    n_init=10,
).fit(df_scaled)

df["Cluster"] = kmeans.labels_
sil_score = silhouette_score(df_scaled, kmeans.labels_)
|
|
|
|
|
# Page heading followed by the three tabs the rest of the script fills in.
st.title("Clustering: Mall Customers Segmentation")
st.caption("Dataset: Mall_Customers.csv")

tab_labels = ["Model Performance", "Dataset", "Customer Segment Predictor"]
tab1, tab2, tab3 = st.tabs(tab_labels)
|
|
|
# "Model Performance" tab: silhouette score, elbow chart, and a scatter plot
# of the fitted clusters.
with tab1:
    st.header("Model Performance")
    st.write(f"**Silhouette Score:** {sil_score:.4f}")

    # Elbow chart. Drawing goes through the explicit Axes rather than
    # pyplot's implicit "current figure", so the output does not depend on
    # global matplotlib state.
    fig, ax = plt.subplots()
    ax.plot(k_values, wcss, marker='o', linestyle='--')
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
    ax.set_title('Elbow Method for Optimal K')
    st.pyplot(fig)
    # Release the figure once rendered; otherwise every script run leaves
    # another open figure registered with pyplot.
    plt.close(fig)

    st.subheader("Customer Segments Visualization")
    fig, ax = plt.subplots()
    sns.scatterplot(
        data=df,
        x='Annual Income (k$)',
        y='Spending Score (1-100)',
        hue='Cluster',
        palette='viridis',
        ax=ax,
    )
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score')
    ax.set_title('Customer Segments')
    st.pyplot(fig)
    plt.close(fig)

    # NOTE(review): source indentation was lost; the divider is assumed to
    # belong to this tab.
    st.divider()
|
|
|
# "Dataset" tab: correlation heatmap plus the full table in either row order.
with tab2:
    st.header("Dataset")

    def corr_matrix(data, title):
        """Render a correlation heatmap of the numeric columns of ``data``.

        Args:
            data: DataFrame to analyse; non-numeric columns are ignored.
            title: Text placed above the heatmap.
        """
        numeric = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(
            numeric.corr(),
            annot=True,
            fmt=".2f",
            cmap="coolwarm",
            linewidths=0.5,
            ax=ax,
        )
        ax.set_title(title)
        st.pyplot(fig)
        # Release the figure once rendered so open figures do not pile up
        # in pyplot across script runs.
        plt.close(fig)

    corr_matrix(df, "Correlation Matrix")
    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])

    # All rows are always shown, so only the row order depends on the choice.
    if view_type == "Top -> Bottom":
        st.dataframe(df)
    else:
        st.dataframe(df.iloc[::-1])

    # NOTE(review): source indentation was lost; the divider is assumed to
    # sit at tab level rather than inside the ``else`` branch.
    st.divider()
|
|
|
def _column_slider(column):
    """Render a slider for ``df[column]`` bounded by its min/max, defaulting to its median.

    The column name doubles as the slider label. Bounds and default are
    truncated to ``int``.
    """
    values = df[column]
    return st.slider(
        column,
        int(values.min()),
        int(values.max()),
        int(values.median()),
    )


# "Customer Segment Predictor" tab: collect one customer's attributes, assign
# them to a cluster with the fitted model, and show the point among the data.
with tab3:
    st.header("Customer Segment Predictor")

    income = _column_slider("Annual Income (k$)")
    spending = _column_slider("Spending Score (1-100)")
    age = _column_slider("Age")
    gender = st.radio("Gender", ["Male", "Female"])

    # Build the single-row frame already encoded, reusing the fitted label
    # encoder so the gender code matches the training data.
    # NOTE(review): the column order must match the frame the scaler was
    # fitted on; this assumes the CSV columns (after dropping CustomerID)
    # are Genre, Age, Annual Income (k$), Spending Score (1-100) - confirm.
    input_data = pd.DataFrame(
        [[le.transform([gender])[0], age, income, spending]],
        columns=["Genre", "Age", "Annual Income (k$)", "Spending Score (1-100)"],
    )
    input_scaled = scaler.transform(input_data)
    predicted_cluster = kmeans.predict(input_scaled)[0]

    st.subheader("Predicted Customer Segment")
    st.markdown(f"<h1 style='color:green;'>Cluster {predicted_cluster}</h1>", unsafe_allow_html=True)

    # Draw through the explicit Axes instead of pyplot's implicit current
    # figure, then close the figure after rendering so figures do not
    # accumulate across script runs.
    fig, ax = plt.subplots()
    sns.scatterplot(
        data=df,
        x='Annual Income (k$)',
        y='Spending Score (1-100)',
        hue='Cluster',
        palette='viridis',
        alpha=0.6,
        ax=ax,
    )
    ax.scatter(income, spending, color='red', label='Your Input', edgecolors='black', s=100)
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score')
    ax.set_title('Customer Segments with Your Input')
    ax.legend()
    st.pyplot(fig)
    plt.close(fig)

    # NOTE(review): source indentation was lost; the divider is assumed to
    # belong to this tab.
    st.divider()