import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
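
# App flow: load the shopping data, one-hot encode and scale the selected features,
# fit KMeans, report clustering-quality metrics, and predict the cluster for a new
# customer profile entered in the UI.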

st.title("Sales Trend Prediction using KMeans Clustering")

@st.cache_data  # requires Streamlit >= 1.18; caches the CSV so it is not re-read on every rerun
def load_data():
    return pd.read_csv("shopping_trends.csv")

df = load_data()

# Select relevant features for clustering
features = ['Gender', 'Item Purchased', 'Previous Purchases', 'Frequency of Purchases', 'Purchase Amount (USD)']
df_filtered = df[features].copy()

# Convert Frequency of Purchases to string
df_filtered['Frequency of Purchases'] = df_filtered['Frequency of Purchases'].astype(str)

# One-hot encode categorical features
categorical_features = ['Gender', 'Item Purchased', 'Frequency of Purchases']
numerical_features = ['Previous Purchases', 'Purchase Amount (USD)']

# drop='first' removes one redundant dummy per category; sparse_output=False needs scikit-learn >= 1.2
ohe = OneHotEncoder(drop='first', sparse_output=False)
encoded_cats = ohe.fit_transform(df_filtered[categorical_features])
categorical_df = pd.DataFrame(encoded_cats, columns=ohe.get_feature_names_out(categorical_features), index=df.index)

df_processed = pd.concat([df_filtered[numerical_features], categorical_df], axis=1)

# Standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_processed)
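# KMeans uses Euclidean distance, so the features are standardized above to keep
# large-range columns (e.g. purchase amount) from dominating the clustering.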

# KMeans Clustering
n_clusters = 3  # Set the number of clusters
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)  # n_init pinned for reproducibility across scikit-learn versions
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Compute Clustering Metrics
silhouette = silhouette_score(X_scaled, df['Cluster'])
davies_bouldin = davies_bouldin_score(X_scaled, df['Cluster'])
calinski_harabasz = calinski_harabasz_score(X_scaled, df['Cluster'])
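# Higher silhouette (max 1.0) and Calinski-Harabasz values indicate better-separated
# clusters; for Davies-Bouldin, lower is better.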

def predict_cluster(user_input):
    """Predicts the cluster for a new user input using the fitted encoder, scaler, and KMeans model."""
    user_df = pd.DataFrame([user_input])
    user_df['Frequency of Purchases'] = user_df['Frequency of Purchases'].astype(str)
    # Transform with the already-fitted encoder; column order must match df_processed
    # (numerical features first, then encoded categoricals) so the scaler sees the same layout.
    user_cats = ohe.transform(user_df[categorical_features])
    user_processed = pd.concat(
        [user_df[numerical_features],
         pd.DataFrame(user_cats, columns=ohe.get_feature_names_out(categorical_features))],
        axis=1
    )
    user_scaled = scaler.transform(user_processed)
    return kmeans.predict(user_scaled)[0]

# Create Tabs
tab1, tab2, tab3 = st.tabs(["Dataset & Metrics", "Visualization", "Sales Trend Prediction"])

with tab1:
    st.subheader("Dataset Preview")
    st.write(df.head())
    
    st.subheader("Clustering Metrics")
    st.write(f"Number of Clusters: {n_clusters}")
    st.write(f"Silhouette Score: {silhouette:.4f}")
    st.write(f"Davies-Bouldin Score: {davies_bouldin:.4f}")
    st.write(f"Calinski-Harabasz Score: {calinski_harabasz:.4f}")

with tab2:
    st.subheader("Data Visualizations")
    
    # Elbow Method Visualization
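    # Inertia (within-cluster sum of squares) always falls as k grows; the "elbow"
    # where the curve flattens suggests a reasonable number of clusters.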
    distortions = []
    K_range = range(2, 11)
    for k in K_range:
        kmeans_tmp = KMeans(n_clusters=k, n_init=10, random_state=42)
        kmeans_tmp.fit(X_scaled)
        distortions.append(kmeans_tmp.inertia_)
    
    fig, ax = plt.subplots()
    ax.plot(K_range, distortions, marker='o')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Distortion')
    ax.set_title('Elbow Method for Optimal K')
    st.pyplot(fig)
    
    # Cluster Distribution Plot
    fig, ax = plt.subplots()
    # hue + legend=False (seaborn >= 0.13) colors bars with the palette without drawing a redundant legend
    sns.countplot(x=df['Cluster'], hue=df['Cluster'], palette='viridis', legend=False, ax=ax)
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Count')
    ax.set_title('Cluster Distribution')
    st.pyplot(fig)
    
    # Visualizations for Item Purchased distribution
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(y=df['Item Purchased'], order=df['Item Purchased'].value_counts().index, hue=df['Item Purchased'], palette='viridis', legend=False, ax=ax)
    ax.set_title("Overall Item Purchase Distribution")
    ax.set_xlabel("Count")
    ax.set_ylabel("Item Purchased")
    st.pyplot(fig)
    
    # Separate Correlation Matrices for Each Label
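    # Note: LabelEncoder assigns arbitrary integer codes to categories, so these
    # correlations are only a rough indication of association, not an effect size.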
    labels = ['Gender', 'Frequency of Purchases', 'Item Purchased']
    for label in labels:
        df[f'{label}_Numeric'] = LabelEncoder().fit_transform(df[label])
        correlation_matrix = df[[f'{label}_Numeric', 'Purchase Amount (USD)', 'Previous Purchases']].corr()
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
        st.subheader(f"Feature Correlation Matrix for {label}")
        st.pyplot(fig)

with tab3:
    st.subheader("Enter Customer Details")
    gender = st.selectbox("Gender", df['Gender'].unique())
    item_purchased = st.selectbox("Item Purchased", df['Item Purchased'].unique())
    previous_purchases = st.number_input("Previous Purchases", min_value=0, max_value=100, value=10)
    frequency_of_purchases = st.selectbox("Frequency of Purchases", df['Frequency of Purchases'].unique().astype(str))
    purchase_amount = st.number_input("Purchase Amount (USD)", min_value=1, max_value=500, value=50)

    if st.button("Predict Sales Trend"):
        user_input = {
            'Gender': gender,
            'Item Purchased': item_purchased,
            'Previous Purchases': previous_purchases,
            'Frequency of Purchases': frequency_of_purchases,
            'Purchase Amount (USD)': purchase_amount
        }
        
        predicted_cluster = predict_cluster(user_input)
        st.write(f"Predicted Cluster: {predicted_cluster}")
        
        st.subheader(f"Sales Trend Analysis for Cluster {predicted_cluster}")
        cluster_data = df[df['Cluster'] == predicted_cluster]
        
        # Visualization of top-selling items in the cluster
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.countplot(y=cluster_data['Item Purchased'], order=cluster_data['Item Purchased'].value_counts().index, hue=cluster_data['Item Purchased'], palette='viridis', legend=False, ax=ax)
        ax.set_title("Top Selling Items in This Cluster")
        ax.set_xlabel("Count")
        ax.set_ylabel("Item Purchased")
        st.pyplot(fig)
        
        # Display average purchase amount trend
        avg_purchase_amount = cluster_data.groupby('Item Purchased')['Purchase Amount (USD)'].mean()
        st.write("### Average Purchase Amount for Items in This Cluster:")
        st.write(avg_purchase_amount)