SURESHBEEKHANI committed on
Commit
874b2d8
·
verified ·
1 Parent(s): dfe0194

Upload 6 files

Browse files
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import matplotlib.pyplot as plt
3
+ from src.clustering import load_data, extract_features, fit_kmeans, calculate_wcss
4
+ from src.utils import plot_cluster_counts, visualize_clusters
5
+ from typing import List
6
+
7
# Page configuration
st.set_page_config(
    page_title="Customer Segmentation",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Sidebar styling via markdown (optional).
# The CSS is kept flush-left inside the string so Markdown does not treat it
# as an indented code block.
st.markdown(
    """
<style>
.reportview-container { padding: 2rem; }
.sidebar .sidebar-content { background-color: #ffffff; padding: 1.5rem; border-radius: 8px; }
</style>
""",
    unsafe_allow_html=True
)

# App title
st.title("📊 Customer Segmentation")

# File upload: nothing below can run without data, so stop the script early.
uploaded_file = st.sidebar.file_uploader("Upload CSV file", type=["csv"])
if not uploaded_file:
    st.sidebar.info("Please upload a CSV file to proceed.")
    st.stop()

# Load data
data = load_data(uploaded_file)
st.subheader("Dataset Preview")
st.dataframe(data.head())

# Select features.
# KMeans only works on numeric input, so non-numeric columns are not offered.
feature_options: List[str] = list(data.select_dtypes(include="number").columns)
if len(feature_options) < 2:
    st.sidebar.error("The dataset needs at least two numeric columns for clustering.")
    st.stop()

# Default to the last two numeric columns instead of fixed positions, so the
# default stays valid for CSV files with a different column layout.
selected_features = st.sidebar.multiselect(
    "Select two features for clustering:",
    options=feature_options,
    default=feature_options[-2:]
)
if len(selected_features) != 2:
    st.sidebar.error("Please select exactly two features.")
    st.stop()

# Clustering settings
n_clusters = st.sidebar.slider(
    "Number of clusters", min_value=2, max_value=10, value=5
)

# Run clustering
if st.sidebar.button("Run Clustering"):
    # KMeans rejects missing values, so only complete rows are clustered.
    clustered = data.dropna(subset=selected_features).copy()
    dropped_rows = len(data) - len(clustered)
    if dropped_rows:
        st.warning(
            f"Ignored {dropped_rows} row(s) with missing values in the selected features."
        )
    if len(clustered) < n_clusters:
        st.error(
            f"Only {len(clustered)} usable row(s) available; "
            f"at least {n_clusters} are needed for {n_clusters} clusters."
        )
        st.stop()

    # Extract features (column order follows the user's selection order)
    X = extract_features(clustered, selected_features)

    # Compute elbow; never ask for more clusters than there are samples.
    wcss = calculate_wcss(X, max_clusters=min(10, len(X)))
    fig_elbow, ax = plt.subplots(figsize=(8, 4))
    ax.plot(range(1, len(wcss) + 1), wcss, marker='o')
    ax.set_title("Elbow Method: WCSS vs. Number of Clusters", fontsize=14)
    ax.set_xlabel("Number of Clusters", fontsize=12)
    ax.set_ylabel("WCSS", fontsize=12)
    ax.grid(True, linestyle="--", alpha=0.6)
    st.subheader("Elbow Method")
    st.pyplot(fig_elbow)
    # Close rendered figures: pyplot keeps every figure alive until closed,
    # and the script is re-executed on each interaction.
    plt.close(fig_elbow)

    # Fit KMeans
    labels, centers = fit_kmeans(X, n_clusters)
    clustered['Cluster'] = labels

    # Cluster visualization
    st.subheader("Cluster Plot")
    fig_clusters = visualize_clusters(X, labels, centers)
    if fig_clusters.axes:
        # Label the axes with the features the user actually picked.
        ax_clusters = fig_clusters.axes[0]
        ax_clusters.set_xlabel(selected_features[0], fontsize=14)
        ax_clusters.set_ylabel(selected_features[1], fontsize=14)
    st.pyplot(fig_clusters)
    plt.close(fig_clusters)

    # Cluster counts
    st.subheader("Cluster Size Distribution")
    fig_counts = plot_cluster_counts(labels)
    st.pyplot(fig_counts)
    plt.close(fig_counts)

    st.success("Clustering completed!")
notebook/Customer_Segmentation_using_K_Means_Clustering.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ matplotlib
5
+ seaborn
6
+ scikit-learn
src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.41 kB). View file
 
src/clustering.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import IO, List, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
5
+
6
+
7
def load_data(filepath: Union[str, os.PathLike, IO]) -> pd.DataFrame:
    """
    Load dataset from a CSV file.

    Args:
        filepath: Path to the CSV file, or an already opened file-like
            object (for example an uploaded file). The value is passed
            unchanged to ``pandas.read_csv``.
    Returns:
        Pandas DataFrame with the parsed CSV content.
    """
    return pd.read_csv(filepath)
17
+
18
+
19
def extract_features(df: pd.DataFrame, feature_cols: List[str]) -> np.ndarray:
    """
    Extract a feature matrix from a DataFrame.

    The selected columns are returned in the order given by ``feature_cols``.
    No dtype conversion or validation is performed here, so the caller is
    responsible for passing numeric columns.

    Args:
        df: Input DataFrame.
        feature_cols: List of column names to use as features.
    Returns:
        2D NumPy array of shape (n_rows, len(feature_cols)).
    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    return df[feature_cols].to_numpy()
30
+
31
+
32
def fit_kmeans(
    X: np.ndarray,
    n_clusters: int,
    random_state: int = 42
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Fit KMeans and return labels and centroids.

    Args:
        X: Feature matrix with one row per sample.
        n_clusters: Number of clusters.
        random_state: Random seed for reproducibility.
    Returns:
        Tuple of (labels array, centers array): the cluster index assigned
        to each row of ``X`` and the fitted ``cluster_centers_``.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(X)
    return labels, kmeans.cluster_centers_
50
+
51
+
52
def calculate_wcss(
    X: np.ndarray,
    max_clusters: int = 10
) -> List[float]:
    """
    Compute within-cluster sum of squares for an increasing cluster count.

    One KMeans model (seeded with ``random_state=42``) is fitted for every
    k from 1 up to ``max_clusters``. The upper bound is capped at the number
    of samples in ``X`` because KMeans cannot fit more clusters than samples;
    the returned list is therefore shorter than ``max_clusters`` for very
    small inputs.

    Args:
        X: Feature matrix with one row per sample.
        max_clusters: Maximum number of clusters to evaluate.
    Returns:
        List of inertia values; element ``i`` belongs to ``k = i + 1``.
    """
    # Cap the range so tiny datasets give a shorter curve instead of an error.
    upper_k = min(max_clusters, len(X))
    return [
        KMeans(n_clusters=k, random_state=42).fit(X).inertia_
        for k in range(1, upper_k + 1)
    ]
src/utils.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+ import pandas as pd
4
+ from collections.abc import Sequence
5
+ from matplotlib.figure import Figure
6
+
7
+
8
def plot_cluster_counts(labels: Sequence[int]) -> Figure:
    """
    Generate a bar chart showing the number of samples in each cluster.

    Args:
        labels: Sequence of integer cluster labels.
    Returns:
        Matplotlib Figure with cluster size distribution.
    """
    # Count and sort cluster sizes (sorted by cluster label, not by size)
    counts = pd.Series(labels).value_counts().sort_index()

    # Create bar chart; labels are converted to strings so the x axis is
    # categorical rather than numeric.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(counts.index.astype(str), counts.values, edgecolor="black")
    ax.set_title("Cluster Size Distribution", fontsize=14, fontweight="bold")
    ax.set_xlabel("Cluster Label", fontsize=12)
    ax.set_ylabel("Number of Samples", fontsize=12)
    ax.grid(axis="y", linestyle="--", alpha=0.6)
    # Lay out this figure explicitly instead of pyplot's "current" figure.
    fig.tight_layout()
    return fig
29
+
30
+
31
def visualize_clusters(
    X: np.ndarray,
    labels: Sequence[int],
    centers: np.ndarray,
    x_label: str = 'Annual Income ($K)',
    y_label: str = 'Spending Score'
) -> Figure:
    """
    Scatter plot of clustered data with centroids.

    Args:
        X: 2D array of shape (n_samples, 2).
        labels: Cluster labels for each sample.
        centers: 2D array of cluster centroids.
        x_label: Text for the x axis (first column of ``X``).
        y_label: Text for the y axis (second column of ``X``).
    Returns:
        Matplotlib Figure with clusters and centroids plotted.
    """
    # Convert to arrays so boolean masking also works for plain lists.
    X = np.asarray(X)
    labels = np.asarray(labels)
    centers = np.asarray(centers)
    unique_labels = np.unique(labels)

    # Choose a colormap
    cmap = plt.get_cmap('tab10')

    fig, ax = plt.subplots(figsize=(8, 6))
    for idx, cluster in enumerate(unique_labels):
        mask = labels == cluster
        ax.scatter(
            X[mask, 0], X[mask, 1],
            s=50,
            label=f"Cluster {cluster}",
            # Wrap around so more clusters than palette entries still get
            # a valid colour index.
            color=cmap(idx % cmap.N),
            edgecolor='k',
            alpha=0.7
        )

    # Plot centroids
    ax.scatter(
        centers[:, 0], centers[:, 1],
        s=200,
        marker='X',
        c='black',
        label='Centroids',
        linewidths=2
    )

    ax.set_title("Cluster Visualization", fontsize=14, fontweight="bold")
    ax.set_xlabel(x_label, fontsize=14)
    # The original called set_xlabel twice, leaving the y axis unlabelled.
    ax.set_ylabel(y_label, fontsize=14)
    ax.legend(title="Clusters", fontsize=10, title_fontsize=12)
    ax.grid(True, linestyle="--", alpha=0.6)
    # Lay out this figure explicitly instead of pyplot's "current" figure.
    fig.tight_layout()
    return fig