Spaces:

SURESHBEEKHANI
/

Customer-Segmentation

Sleeping

App Files Files Community

SURESHBEEKHANI committed on Apr 14

Commit

874b2d8

verified ·

1 Parent(s): dfe0194

Upload 6 files

Browse files

Files changed (6) hide show

app.py +85 -0
notebook/Customer_Segmentation_using_K_Means_Clustering.ipynb +0 -0
requirements.txt +6 -0
src/__pycache__/utils.cpython-310.pyc +0 -0
src/clustering.py +70 -0
src/utils.py +80 -0

app.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import streamlit as st
+import matplotlib.pyplot as plt
+from src.clustering import load_data, extract_features, fit_kmeans, calculate_wcss
+from src.utils import plot_cluster_counts, visualize_clusters
+from typing import List
+# Page configuration
+st.set_page_config(
+    page_title="Customer Segmentation",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Sidebar styling via markdown (optional)
+st.markdown(
+    """
+    <style>
+    .reportview-container { padding: 2rem; }
+    .sidebar .sidebar-content { background-color: #ffffff; padding: 1.5rem; border-radius: 8px; }
+    </style>
+    """,
+    unsafe_allow_html=True
+)
+# App title
+st.title("📊 Customer Segmentation")
+# File upload
+uploaded_file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
+if not uploaded_file:
+    st.sidebar.info("Please upload a CSV file to proceed.")
+    st.stop()
+# Load data
+data = load_data(uploaded_file)
+st.subheader("Dataset Preview")
+st.dataframe(data.head())
+# Select features
+feature_options: List[str] = list(data.columns)
+selected_features = st.sidebar.multiselect(
+    "Select two features for clustering:",
+    options=feature_options,
+    default=feature_options[3:5]
+)
+if len(selected_features) != 2:
+    st.sidebar.error("Please select exactly two features.")
+    st.stop()
+# Clustering settings
+n_clusters = st.sidebar.slider(
+    "Number of clusters", min_value=2, max_value=10, value=5
+)
+# Run clustering
+if st.sidebar.button("Run Clustering"):
+    # Extract features
+    X = extract_features(data, selected_features)
+    # Compute elbow
+    wcss = calculate_wcss(X, max_clusters=10)
+    fig_elbow, ax = plt.subplots(figsize=(8, 4))
+    ax.plot(range(1, len(wcss) + 1), wcss, marker='o')
+    ax.set_title("Elbow Method: WCSS vs. Number of Clusters", fontsize=14)
+    ax.set_xlabel("Number of Clusters", fontsize=12)
+    ax.set_ylabel("WCSS", fontsize=12)
+    ax.grid(True, linestyle="--", alpha=0.6)
+    st.subheader("Elbow Method")
+    st.pyplot(fig_elbow)
+    # Fit KMeans
+    labels, centers = fit_kmeans(X, n_clusters)
+    data['Cluster'] = labels
+    # Cluster visualization
+    st.subheader("Cluster Plot")
+    fig_clusters = visualize_clusters(X, labels, centers)
+    st.pyplot(fig_clusters)
+    # Cluster counts
+    st.subheader("Cluster Size Distribution")
+    fig_counts = plot_cluster_counts(labels)
+    st.pyplot(fig_counts)
+    st.success("Clustering completed!")

notebook/Customer_Segmentation_using_K_Means_Clustering.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+streamlit
+pandas
+numpy
+matplotlib
+seaborn
+scikit-learn

src/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (2.41 kB). View file

src/clustering.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import pandas as pd
+import numpy as np
+from sklearn.cluster import KMeans
+from typing import Tuple, List
+def load_data(filepath: str) -> pd.DataFrame:
+    """
+    Load dataset from a CSV file.
+    Args:
+        filepath: Path to the CSV file.
+    Returns:
+        Pandas DataFrame.
+    """
+    return pd.read_csv(filepath)
+def extract_features(df: pd.DataFrame, feature_cols: List[str]) -> np.ndarray:
+    """
+    Extract numeric feature matrix from DataFrame.
+    Args:
+        df: Input DataFrame.
+        feature_cols: List of column names to use as features.
+    Returns:
+        2D NumPy array of features.
+    """
+    return df[feature_cols].to_numpy()
+def fit_kmeans(
+    X: np.ndarray,
+    n_clusters: int,
+    random_state: int = 42
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Fit KMeans and return labels and centroids.
+    Args:
+        X: Feature matrix.
+        n_clusters: Number of clusters.
+        random_state: Random seed for reproducibility.
+    Returns:
+        Tuple of (labels array, centers array).
+    """
+    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
+    labels = kmeans.fit_predict(X)
+    return labels, kmeans.cluster_centers_
+def calculate_wcss(
+    X: np.ndarray,
+    max_clusters: int = 10
+) -> List[float]:
+    """
+    Compute within-cluster sum of squares for 1..max_clusters.
+    Args:
+        X: Feature matrix.
+        max_clusters: Maximum number of clusters to evaluate.
+    Returns:
+        List of inertia values.
+    """
+    wcss = []
+    for k in range(1, max_clusters + 1):
+        kmeans = KMeans(n_clusters=k, random_state=42)
+        kmeans.fit(X)
+        wcss.append(kmeans.inertia_)
+    return wcss

src/utils.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from collections.abc import Sequence
+from matplotlib.figure import Figure
+def plot_cluster_counts(labels: Sequence[int]) -> Figure:
+    """
+    Generate a bar chart showing the number of samples in each cluster.
+    Args:
+        labels: Sequence of integer cluster labels.
+    Returns:
+        Matplotlib Figure with cluster size distribution.
+    """
+    # Count and sort cluster sizes
+    counts = pd.Series(labels).value_counts().sort_index()
+    # Create bar chart
+    fig, ax = plt.subplots(figsize=(8, 5))
+    ax.bar(counts.index.astype(str), counts.values, edgecolor="black")
+    ax.set_title("Cluster Size Distribution", fontsize=14, fontweight="bold")
+    ax.set_xlabel("Cluster Label", fontsize=12)
+    ax.set_ylabel("Number of Samples", fontsize=12)
+    ax.grid(axis="y", linestyle="--", alpha=0.6)
+    plt.tight_layout()
+    return fig
+def visualize_clusters(
+    X: np.ndarray,
+    labels: Sequence[int],
+    centers: np.ndarray
+) -> Figure:
+    """
+    Scatter plot of clustered data with centroids.
+    Args:
+        X: 2D array of shape (n_samples, 2).
+        labels: Cluster labels for each sample.
+        centers: 2D array of cluster centroids.
+    Returns:
+        Matplotlib Figure with clusters and centroids plotted.
+    """
+    unique_labels = np.unique(labels)
+    n_clusters = unique_labels.size
+    # Choose a colormap
+    cmap = plt.get_cmap('tab10')
+    fig, ax = plt.subplots(figsize=(8, 6))
+    for idx, cluster in enumerate(unique_labels):
+        mask = labels == cluster
+        ax.scatter(
+            X[mask, 0], X[mask, 1],
+            s=50,
+            label=f"Cluster {cluster}",
+            color=cmap(idx),
+            edgecolor='k',
+            alpha=0.7
+        )
+    # Plot centroids
+    ax.scatter(
+        centers[:, 0], centers[:, 1],
+        s=200,
+        marker='X',
+        c='black',
+        label='Centroids',
+        linewidths=2
+    )
+    ax.set_title("Cluster Visualization", fontsize=14, fontweight="bold")
+    ax.set_xlabel('Annual Income ($K)', fontsize=14)
+    ax.set_xlabel('Spending Score', fontsize=14)
+    ax.legend(title="Clusters", fontsize=10, title_fontsize=12)
+    ax.grid(True, linestyle="--", alpha=0.6)
+    plt.tight_layout()
+    return fig