Spaces:

allantacuelwvsu
/

k-means_clustering

Running

App Files Files Community

k-means_clustering / app.py

allantacuelwvsu

update app.py

5b3cd15 about 23 hours ago

raw

history blame contribute delete

4.09 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.preprocessing import StandardScaler, LabelEncoder
	from sklearn.cluster import KMeans
	from sklearn.metrics import silhouette_score

	# Load dataset
	@st.cache_data()
	def load_data():
	    """Read the mall-customers CSV and return it as a DataFrame.

	    Decorated with ``st.cache_data`` so Streamlit can reuse the parsed
	    frame across script reruns instead of reading the file again.
	    """
	    return pd.read_csv("datasets/Mall_Customers.csv")

	@st.cache_data()
	def _fit_segmentation(k_optimal, k_max):
	    """Preprocess the dataset, run the elbow sweep and fit the final model.

	    Streamlit re-executes the whole script on every widget interaction;
	    caching this function keeps the KMeans fits from being repeated on
	    each rerun.

	    Args:
	        k_optimal: Number of clusters for the final KMeans model.
	        k_max: Largest K evaluated for the elbow curve (sweep is 1..k_max).

	    Returns:
	        Tuple ``(data, encoder, std_scaler, scaled, inertias, model, score)``:
	        the preprocessed frame with a ``Cluster`` column, the fitted
	        LabelEncoder and StandardScaler, the scaled feature matrix, the
	        WCSS value for each K in the sweep, the fitted KMeans model and
	        its silhouette score.
	    """
	    # CustomerID is an identifier, not a feature.
	    data = load_data().drop(columns=["CustomerID"])

	    # LabelEncoder assigns codes in sorted label order, so with the labels
	    # "Female"/"Male" the mapping is Female=0, Male=1.
	    encoder = LabelEncoder()
	    data["Genre"] = encoder.fit_transform(data["Genre"])

	    # Scale before the Cluster column is added so only features are scaled.
	    std_scaler = StandardScaler()
	    scaled = std_scaler.fit_transform(data)

	    # Elbow method: within-cluster sum of squares for each candidate K.
	    inertias = [
	        KMeans(n_clusters=k, init='k-means++', random_state=1, n_init=10).fit(scaled).inertia_
	        for k in range(1, k_max + 1)
	    ]

	    model = KMeans(n_clusters=k_optimal, init='k-means++', random_state=1, n_init=10)
	    model.fit(scaled)
	    data['Cluster'] = model.labels_

	    score = silhouette_score(scaled, model.labels_)
	    return data, encoder, std_scaler, scaled, inertias, model, score


	# Candidate K values plotted on the elbow curve.
	k_values = range(1, 11)

	# Choose optimal K (assumed 5 based on elbow curve)
	k_optimal = 5

	df, le, scaler, df_scaled, wcss, kmeans, sil_score = _fit_segmentation(k_optimal, len(k_values))

	# Streamlit App
	# Page heading and the three top-level tabs of the app.
	st.title("Clustering: Mall Customers Segmentation")
	st.caption("Dataset: Mall_Customers.csv")

	tab_labels = ["Model Performance", "Dataset", "Customer Segment Predictor"]
	tab1, tab2, tab3 = st.tabs(tab_labels)

	# Tab 1: silhouette score, elbow curve and the cluster scatter plot.
	with tab1:
	    st.header("Model Performance")
	    st.write(f"Silhouette Score: {sil_score:.4f}")

	    # Elbow curve: WCSS against the number of clusters.
	    fig, ax = plt.subplots()
	    ax.plot(k_values, wcss, marker='o', linestyle='--')
	    ax.set_xlabel('Number of Clusters (K)')
	    ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
	    ax.set_title('Elbow Method for Optimal K')
	    st.pyplot(fig)
	    # pyplot keeps every figure it creates until it is closed; without this
	    # each script rerun would leave another figure in memory.
	    plt.close(fig)

	    st.subheader("Customer Segments Visualization")
	    fig, ax = plt.subplots()
	    sns.scatterplot(
	        x=df['Annual Income (k$)'],
	        y=df['Spending Score (1-100)'],
	        hue=df['Cluster'],
	        palette='viridis',
	        ax=ax,
	    )
	    ax.set_xlabel('Annual Income (k$)')
	    ax.set_ylabel('Spending Score')
	    ax.set_title('Customer Segments')
	    st.pyplot(fig)
	    plt.close(fig)
	    st.divider()

	# Tab 2: correlation heatmap and the dataset table in either row order.
	with tab2:
	    st.header("Dataset")

	    def corr_matrix(data, title):
	        """Render a correlation heatmap of the numeric columns of *data*.

	        Args:
	            data: DataFrame to analyse; non-numeric columns are ignored.
	            title: Title drawn above the heatmap.
	        """
	        numeric = data.select_dtypes(include=["number"])
	        fig, ax = plt.subplots(figsize=(8, 6))
	        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
	        ax.set_title(title)
	        st.pyplot(fig)
	        # Close the figure so pyplot does not keep one more alive per rerun.
	        plt.close(fig)

	    corr_matrix(df, "Correlation Matrix")
	    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])

	    if view_type == "Top -> Bottom":
	        st.dataframe(df)
	    else:
	        # Same rows, last one first.
	        st.dataframe(df.iloc[::-1])
	    st.divider()

	# Tab 3: predict the cluster for a user-entered customer profile.
	with tab3:
	    st.header("Customer Segment Predictor")

	    def _column_slider(column):
	        """Integer slider spanning *column*'s min..max, defaulting to its median."""
	        values = df[column]
	        return st.slider(column, int(values.min()), int(values.max()), int(values.median()))

	    income = _column_slider("Annual Income (k$)")
	    spending = _column_slider("Spending Score (1-100)")
	    age = _column_slider("Age")
	    gender = st.radio("Gender", ["Male", "Female"])

	    # Encode gender with the encoder fitted on the training data, then build
	    # the row in the same column order the scaler was fitted on.
	    gender_code = le.transform([gender])[0]
	    input_data = pd.DataFrame(
	        [[gender_code, age, income, spending]],
	        columns=["Genre", "Age", "Annual Income (k$)", "Spending Score (1-100)"],
	    )
	    input_scaled = scaler.transform(input_data)
	    predicted_cluster = kmeans.predict(input_scaled)[0]

	    st.subheader("Predicted Customer Segment")
	    st.markdown(f"<h1 style='color:green;'>Cluster {predicted_cluster}</h1>", unsafe_allow_html=True)

	    # Graph to visualize input placement
	    fig, ax = plt.subplots()
	    sns.scatterplot(
	        x=df['Annual Income (k$)'],
	        y=df['Spending Score (1-100)'],
	        hue=df['Cluster'],
	        palette='viridis',
	        alpha=0.6,
	        ax=ax,
	    )
	    ax.scatter(income, spending, color='red', label='Your Input', edgecolors='black', s=100)
	    ax.set_xlabel('Annual Income (k$)')
	    ax.set_ylabel('Spending Score')
	    ax.set_title('Customer Segments with Your Input')
	    ax.legend()
	    st.pyplot(fig)
	    # Close the figure so pyplot does not keep one more alive per rerun.
	    plt.close(fig)

	    st.divider()