import streamlit as st |
import pandas as pd |
import numpy as np |
import matplotlib.pyplot as plt |
import seaborn as sns |
from sklearn.preprocessing import StandardScaler, LabelEncoder |
from sklearn.cluster import KMeans |
from sklearn.metrics import silhouette_score |
@st.cache_data()
def load_data():
    """Read the mall-customers CSV into a DataFrame.

    The call is wrapped in ``st.cache_data`` so the file is not parsed again
    on every script rerun.

    Returns:
        The parsed contents of ``datasets/Mall_Customers.csv`` (the path is
        relative, so it is resolved against the current working directory).
    """
    return pd.read_csv("datasets/Mall_Customers.csv")
# Cluster counts evaluated for the elbow curve, and the K used for the final model.
k_values = range(1, 11)
k_optimal = 5


@st.cache_resource
def _fit_segmentation(k_candidates, n_clusters):
    """Prepare the customer data and fit every model the page needs.

    Streamlit re-executes the whole script on each widget interaction. Caching
    this function means the KMeans fits and the silhouette score are computed
    once per distinct argument set instead of on every slider move. All fits
    use ``random_state=1``, so the cached result equals a fresh computation.

    Args:
        k_candidates: Tuple of cluster counts to fit for the elbow curve.
        n_clusters: Cluster count of the final segmentation model.

    Returns:
        dict with keys ``df`` (features plus a ``Cluster`` label column),
        ``le`` (fitted LabelEncoder for ``Genre``), ``scaler`` (fitted
        StandardScaler), ``df_scaled`` (scaled feature matrix), ``wcss``
        (inertia per candidate K), ``kmeans`` (final fitted model) and
        ``sil_score`` (silhouette score of the final model).
    """
    # drop() without inplace returns a new frame, so the loaded data is untouched.
    features = load_data().drop(columns=["CustomerID"])

    encoder = LabelEncoder()
    features["Genre"] = encoder.fit_transform(features["Genre"])

    std_scaler = StandardScaler()
    scaled = std_scaler.fit_transform(features)

    def _fit(k):
        # Single place for the KMeans settings shared by the elbow runs and the final model.
        return KMeans(n_clusters=k, init='k-means++', random_state=1, n_init=10).fit(scaled)

    inertias = [_fit(k).inertia_ for k in k_candidates]

    final_model = _fit(n_clusters)
    # Labels are attached only after scaling, so 'Cluster' is never a model feature.
    features['Cluster'] = final_model.labels_

    return {
        "df": features,
        "le": encoder,
        "scaler": std_scaler,
        "df_scaled": scaled,
        "wcss": inertias,
        "kmeans": final_model,
        "sil_score": silhouette_score(scaled, final_model.labels_),
    }


_fitted = _fit_segmentation(tuple(k_values), k_optimal)

# st.cache_resource hands back the same objects on every rerun and session.
# Copy the frame so this run owns its own DataFrame; treat the rest as read-only.
df = _fitted["df"].copy()
le = _fitted["le"]
scaler = _fitted["scaler"]
df_scaled = _fitted["df_scaled"]
wcss = _fitted["wcss"]
kmeans = _fitted["kmeans"]
sil_score = _fitted["sil_score"]
# Page header, followed by one tab per section of the app.
st.title("Clustering: Mall Customers Segmentation")
st.caption("Dataset: Mall_Customers.csv")

_tab_labels = ["Model Performance", "Dataset", "Customer Segment Predictor"]
tab1, tab2, tab3 = st.tabs(_tab_labels)
# "Model Performance" tab: silhouette score, elbow curve, and the cluster scatter plot.
with tab1:
    st.header("Model Performance")
    st.write(f"**Silhouette Score:** {sil_score:.4f}")

    # Draw through the Axes object rather than pyplot's implicit "current
    # figure", so each plot is tied to the figure that is actually rendered.
    fig, ax = plt.subplots()
    ax.plot(k_values, wcss, marker='o', linestyle='--')
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
    ax.set_title('Elbow Method for Optimal K')
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; the script reruns on
    # each interaction, so unclosed figures would pile up in memory.
    plt.close(fig)

    st.subheader("Customer Segments Visualization")
    fig, ax = plt.subplots()
    sns.scatterplot(
        data=df,
        x='Annual Income (k$)',
        y='Spending Score (1-100)',
        hue='Cluster',
        palette='viridis',
        ax=ax,
    )
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score')
    ax.set_title('Customer Segments')
    st.pyplot(fig)
    plt.close(fig)

    st.divider()
# "Dataset" tab: correlation heatmap plus the full table in either row order.
with tab2:
    st.header("Dataset")

    def corr_matrix(data, title):
        """Render a correlation heatmap of the numeric columns of ``data``.

        Args:
            data: DataFrame to analyse; non-numeric columns are ignored.
            title: Title shown above the heatmap.
        """
        numeric = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
        ax.set_title(title)
        st.pyplot(fig)
        # Release the figure once rendered; pyplot otherwise keeps it alive across reruns.
        plt.close(fig)

    corr_matrix(df, "Correlation Matrix")

    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
    if view_type == "Top -> Bottom":
        st.dataframe(df)
    else:
        # Same rows, last one first.
        st.dataframe(df.iloc[::-1])

    st.divider()
# "Customer Segment Predictor" tab: collect one customer's attributes, predict
# their cluster, and show the point on top of the existing segments.
with tab3:
    st.header("Customer Segment Predictor")

    def _column_slider(column):
        """Slider labelled ``column``, spanning that column's min..max and starting at its median."""
        values = df[column]
        return st.slider(column, int(values.min()), int(values.max()), int(values.median()))

    income = _column_slider('Annual Income (k$)')
    spending = _column_slider('Spending Score (1-100)')
    age = _column_slider('Age')
    gender = st.radio("Gender", ["Male", "Female"])

    # Encode the gender label before building the row so the frame is numeric
    # from the start. le.transform raises ValueError for a label it was not
    # fitted on.
    # NOTE(review): column names and order presumably match the frame the
    # scaler was fitted on (the CSV minus CustomerID) - verify against the data.
    input_data = pd.DataFrame(
        [[le.transform([gender])[0], age, income, spending]],
        columns=["Genre", "Age", "Annual Income (k$)", "Spending Score (1-100)"],
    )
    input_scaled = scaler.transform(input_data)
    predicted_cluster = kmeans.predict(input_scaled)[0]

    st.subheader("Predicted Customer Segment")
    st.markdown(f"<h1 style='color:green;'>Cluster {predicted_cluster}</h1>", unsafe_allow_html=True)

    # Use the explicit Axes API instead of pyplot's implicit current figure.
    fig, ax = plt.subplots()
    sns.scatterplot(
        data=df,
        x='Annual Income (k$)',
        y='Spending Score (1-100)',
        hue='Cluster',
        palette='viridis',
        alpha=0.6,
        ax=ax,
    )
    ax.scatter(income, spending, color='red', label='Your Input', edgecolors='black', s=100)
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score')
    ax.set_title('Customer Segments with Your Input')
    # Rebuild the legend so the 'Your Input' marker is listed with the clusters.
    ax.legend()
    st.pyplot(fig)
    # Close after rendering; the script reruns on every slider move and
    # unclosed figures would accumulate.
    plt.close(fig)

    st.divider()