|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sklearn.cluster import KMeans |
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder |
|
from sklearn.pipeline import Pipeline |
|
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score |
|
|
|
# Heading rendered at the top of the Streamlit page.
_PAGE_TITLE = "Sales Trend Prediction using KMeans Clustering"
st.title(_PAGE_TITLE)
|
|
|
def load_data(path: str = "shopping_trends.csv") -> pd.DataFrame:
    """Read the shopping-trends CSV into a DataFrame.

    Args:
        path: Location of the CSV file, passed straight to
            ``pandas.read_csv``. Defaults to ``"shopping_trends.csv"``
            (the previously hard-coded file name), so calling
            ``load_data()`` with no arguments behaves as before.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised by pandas).
    """
    return pd.read_csv(path)
|
|
|
# ---------------------------------------------------------------------------
# Data preparation, clustering and quality metrics.
# These are plain module-level statements, so they execute on every run of
# the script.
# ---------------------------------------------------------------------------
df = load_data()

# Column groups that feed the clustering model.
categorical_features = ['Gender', 'Item Purchased', 'Frequency of Purchases']
numerical_features = ['Previous Purchases', 'Purchase Amount (USD)']
features = [
    'Gender',
    'Item Purchased',
    'Previous Purchases',
    'Frequency of Purchases',
    'Purchase Amount (USD)',
]

# Work on a copy so the string cast below does not touch ``df`` itself.
df_filtered = df[features].copy()
df_filtered['Frequency of Purchases'] = (
    df_filtered['Frequency of Purchases'].astype(str)
)

# One-hot encode the categorical columns (drop='first' omits one level per
# column) and wrap the result in a frame aligned to the original row index.
ohe = OneHotEncoder(drop='first', sparse_output=False)
encoded_cats = ohe.fit_transform(df_filtered[categorical_features])
categorical_df = pd.DataFrame(
    encoded_cats,
    index=df.index,
    columns=ohe.get_feature_names_out(categorical_features),
)

# Numeric columns first, encoded columns after; the scaler is fitted on this
# exact column layout.
df_processed = pd.concat(
    [df_filtered[numerical_features], categorical_df],
    axis=1,
)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_processed)

# Fit KMeans with a fixed seed and store each row's label on ``df``.
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Clustering-quality scores computed on the scaled feature matrix.
cluster_labels = df['Cluster']
silhouette = silhouette_score(X_scaled, cluster_labels)
davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
calinski_harabasz = calinski_harabasz_score(X_scaled, cluster_labels)
|
|
|
def predict_cluster(user_input):
    """Return the KMeans cluster label for a single customer record.

    ``user_input`` is a mapping that must contain a value for every column
    named in ``categorical_features`` and ``numerical_features``. The record
    is run through the already-fitted module-level ``ohe``, ``scaler`` and
    ``kmeans`` objects, in that order, and the first (only) predicted label
    is returned.
    """
    record = pd.DataFrame([user_input])
    # Same string cast that was applied to this column before fitting.
    record['Frequency of Purchases'] = record['Frequency of Purchases'].astype(str)

    encoded = pd.DataFrame(
        ohe.transform(record[categorical_features]),
        columns=ohe.get_feature_names_out(categorical_features),
    )
    # Numeric columns first, then the one-hot columns.
    model_input = pd.concat([record[numerical_features], encoded], axis=1)

    labels = kmeans.predict(scaler.transform(model_input))
    return labels[0]
|
|
|
|
|
# The page is split into three tabs; each is filled in by its own ``with`` block.
tab1, tab2, tab3 = st.tabs(
    ["Dataset & Metrics", "Visualization", "Sales Trend Prediction"]
)

with tab1:
    # First rows of the dataset (``df`` already carries the 'Cluster' column).
    st.subheader("Dataset Preview")
    st.write(df.head())

    st.subheader("Clustering Metrics")
    st.write(f"Number of Clusters: {n_clusters}")
    # Each score is shown with four decimal places.
    for metric_name, metric_value in (
        ("Silhouette Score", silhouette),
        ("Davies-Bouldin Score", davies_bouldin),
        ("Calinski-Harabasz Score", calinski_harabasz),
    ):
        st.write(f"{metric_name}: {metric_value:.4f}")
|
|
|
with tab2:
    st.subheader("Data Visualizations")

    # NOTE: every figure below is closed right after st.pyplot() has rendered
    # it. pyplot keeps figures created via plt.subplots() registered until
    # they are closed, and this script body runs again on each rerun, so
    # leaving them open makes memory grow steadily.

    # Elbow curve: inertia of a fresh KMeans fit for each k in 2..10.
    K_range = range(2, 11)
    distortions = [
        KMeans(n_clusters=k, random_state=42).fit(X_scaled).inertia_
        for k in K_range
    ]

    fig, ax = plt.subplots()
    ax.plot(K_range, distortions, marker='o')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Distortion')
    ax.set_title('Elbow Method for Optimal K')
    st.pyplot(fig)
    plt.close(fig)

    # Number of rows assigned to each cluster.
    # NOTE(review): recent seaborn releases appear to deprecate `palette`
    # without `hue`; confirm against the installed version before changing.
    fig, ax = plt.subplots()
    sns.countplot(x=df['Cluster'], palette='viridis', ax=ax)
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Count')
    ax.set_title('Cluster Distribution')
    st.pyplot(fig)
    plt.close(fig)

    # Purchase counts per item over the whole dataset, most frequent first.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(
        y=df['Item Purchased'],
        order=df['Item Purchased'].value_counts().index,
        hue=df['Item Purchased'],
        palette='viridis',
        ax=ax,
    )
    ax.set_title("Overall Item Purchase Distribution")
    ax.set_xlabel("Count")
    ax.set_ylabel("Item Purchased")
    st.pyplot(fig)
    plt.close(fig)

    # One correlation heatmap per categorical column: the column is
    # label-encoded into '<name>_Numeric' on ``df`` and correlated with the
    # two numeric columns.
    labels = ['Gender', 'Frequency of Purchases', 'Item Purchased']
    for label in labels:
        df[f'{label}_Numeric'] = LabelEncoder().fit_transform(df[label])
        correlation_matrix = df[
            [f'{label}_Numeric', 'Purchase Amount (USD)', 'Previous Purchases']
        ].corr()

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
        st.subheader(f"Feature Correlation Matrix for {label}")
        st.pyplot(fig)
        plt.close(fig)
|
|
|
with tab3:
    st.subheader("Enter Customer Details")

    # Input widgets; the categorical choices are taken from the values
    # present in the dataset.
    gender = st.selectbox("Gender", df['Gender'].unique())
    item_purchased = st.selectbox("Item Purchased", df['Item Purchased'].unique())
    previous_purchases = st.number_input(
        "Previous Purchases", min_value=0, max_value=100, value=10
    )
    # Offered as strings, matching the string cast applied before encoding.
    frequency_of_purchases = st.selectbox(
        "Frequency of Purchases",
        df['Frequency of Purchases'].unique().astype(str),
    )
    purchase_amount = st.number_input(
        "Purchase Amount (USD)", min_value=1, max_value=500, value=50
    )

    if st.button("Predict Sales Trend"):
        user_input = {
            'Gender': gender,
            'Item Purchased': item_purchased,
            'Previous Purchases': previous_purchases,
            'Frequency of Purchases': frequency_of_purchases,
            'Purchase Amount (USD)': purchase_amount,
        }

        predicted_cluster = predict_cluster(user_input)
        st.write(f"Predicted Cluster: {predicted_cluster}")

        st.subheader(f"Sales Trend Analysis for Cluster {predicted_cluster}")
        cluster_data = df[df['Cluster'] == predicted_cluster]

        # Item counts within the predicted cluster, most frequent first.
        # NOTE(review): recent seaborn releases appear to deprecate `palette`
        # without `hue`; confirm against the installed version before changing.
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.countplot(
            y=cluster_data['Item Purchased'],
            order=cluster_data['Item Purchased'].value_counts().index,
            palette='viridis',
            ax=ax,
        )
        ax.set_title("Top Selling Items in This Cluster")
        ax.set_xlabel("Count")
        ax.set_ylabel("Item Purchased")
        st.pyplot(fig)
        # Close the figure once rendered: pyplot keeps figures registered
        # until closed, so each button press would otherwise leave one behind.
        plt.close(fig)

        # Mean purchase amount per item inside the predicted cluster.
        avg_purchase_amount = cluster_data.groupby('Item Purchased')['Purchase Amount (USD)'].mean()
        st.write("### Average Purchase Amount for Items in This Cluster:")
        st.write(avg_purchase_amount)
|
|