# app.py - published by allantacuelwvsu (commit 5b3cd15, "update app.py")
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Load dataset
@st.cache_data()
def load_data():
    """Read the mall-customers CSV from ``datasets/`` and return it as a DataFrame.

    The call is wrapped in ``st.cache_data`` so the file is not re-read each
    time the script is executed again.
    """
    return pd.read_csv("datasets/Mall_Customers.csv")
# --- Preprocessing ----------------------------------------------------------
df = load_data()
df.drop(columns=["CustomerID"], inplace=True)  # Drop non-essential column

# LabelEncoder numbers its classes in sorted order, so with the labels
# "Female" and "Male" (the two options the predictor tab offers) the mapping
# is Female=0, Male=1.
le = LabelEncoder()
df["Genre"] = le.fit_transform(df["Genre"])

# Standardise every column so each feature has zero mean and unit variance
# before distances are computed by KMeans.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)


@st.cache_resource
def _fit_kmeans(features, n_clusters):
    """Fit and return a KMeans model with ``n_clusters`` clusters on ``features``.

    Streamlit re-executes the whole script on every widget interaction; caching
    the fitted estimator (keyed on the feature matrix and the cluster count)
    means each model is trained once instead of on every rerun. The returned
    estimator is shared between reruns and must be treated as read-only.
    """
    model = KMeans(n_clusters=n_clusters, init='k-means++', random_state=1, n_init=10)
    return model.fit(features)


# --- Find optimal K -----------------------------------------------------------
# Within-cluster sum of squares (inertia) for K = 1..10, plotted later as the
# elbow curve.
k_values = range(1, 11)
wcss = [_fit_kmeans(df_scaled, k).inertia_ for k in k_values]

# Choose optimal K (assumed 5 based on elbow curve). The model for this K was
# already fitted during the sweep above, so this lookup is served from cache.
k_optimal = 5
kmeans = _fit_kmeans(df_scaled, k_optimal)
df['Cluster'] = kmeans.labels_
sil_score = silhouette_score(df_scaled, kmeans.labels_)
# Streamlit App: page heading, dataset caption, and the three top-level tabs.
st.title("Clustering: Mall Customers Segmentation")
st.caption("Dataset: Mall_Customers.csv")
_tab_labels = ["Model Performance", "Dataset", "Customer Segment Predictor"]
tab1, tab2, tab3 = st.tabs(_tab_labels)
with tab1:
    # Tab 1: clustering quality metric, elbow curve, and the fitted segments.
    st.header("Model Performance")
    st.write(f"**Silhouette Score:** {sil_score:.4f}")

    # Elbow curve. Everything is drawn on the explicit Axes object rather than
    # through pyplot's implicit "current axes", which is global state.
    fig, ax = plt.subplots()
    ax.plot(k_values, wcss, marker='o', linestyle='--')
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
    ax.set_title('Elbow Method for Optimal K')
    st.pyplot(fig)
    # Figures made with plt.subplots() stay registered with pyplot until they
    # are closed; without this they accumulate on every script rerun.
    plt.close(fig)

    st.subheader("Customer Segments Visualization")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df['Annual Income (k$)'],
        y=df['Spending Score (1-100)'],
        hue=df['Cluster'],
        palette='viridis',
        ax=ax,
    )
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score')
    ax.set_title('Customer Segments')
    st.pyplot(fig)
    plt.close(fig)
    st.divider()
with tab2:
    # Tab 2: correlation heatmap plus the full table in either row order.
    st.header("Dataset")

    def corr_matrix(data, title):
        """Render a correlation heatmap of the numeric columns of ``data``.

        Args:
            data: DataFrame whose numeric columns are correlated pairwise.
            title: Text placed above the heatmap.
        """
        numeric = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
        ax.set_title(title)
        st.pyplot(fig)
        # Release the figure; otherwise one more stays open after each rerun.
        plt.close(fig)

    corr_matrix(df, "Correlation Matrix")

    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
    if view_type == "Top -> Bottom":
        st.dataframe(df)
    else:
        # Same rows, last one first.
        st.dataframe(df.iloc[::-1])
    st.divider()
with tab3:
    # Tab 3: collect one customer's attributes, predict its cluster, and show
    # where it falls among the existing customers.
    st.header("Customer Segment Predictor")

    def _column_slider(column):
        """Return an integer slider for ``column`` spanning its min..max, defaulting to its median."""
        values = df[column]
        return st.slider(column, int(values.min()), int(values.max()), int(values.median()))

    income = _column_slider('Annual Income (k$)')
    spending = _column_slider('Spending Score (1-100)')
    age = _column_slider('Age')
    gender = st.radio("Gender", ["Male", "Female"])

    # Encode the gender with the encoder fitted on the dataset, then assemble
    # the row with the same column names and order the scaler was fitted on.
    gender_code = le.transform([gender])[0]
    input_data = pd.DataFrame(
        [[gender_code, age, income, spending]],
        columns=["Genre", "Age", "Annual Income (k$)", "Spending Score (1-100)"],
    )
    input_scaled = scaler.transform(input_data)
    predicted_cluster = kmeans.predict(input_scaled)[0]

    st.subheader("Predicted Customer Segment")
    st.markdown(f"<h1 style='color:green;'>Cluster {predicted_cluster}</h1>", unsafe_allow_html=True)

    # Graph to visualize input placement. Drawn on the explicit Axes object
    # instead of pyplot's global current-axes state.
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=df['Annual Income (k$)'],
        y=df['Spending Score (1-100)'],
        hue=df['Cluster'],
        palette='viridis',
        alpha=0.6,
        ax=ax,
    )
    ax.scatter(income, spending, color='red', label='Your Input', edgecolors='black', s=100)
    ax.set_xlabel('Annual Income (k$)')
    ax.set_ylabel('Spending Score')
    ax.set_title('Customer Segments with Your Input')
    ax.legend()
    st.pyplot(fig)
    # Close the figure so it does not linger in pyplot's registry across reruns.
    plt.close(fig)
    st.divider()