"""Streamlit app: segment shoppers with KMeans and predict the cluster
(sales trend) for a new customer profile.

Expects a ``shopping_trends.csv`` file in the working directory with at
least these columns: Gender, Item Purchased, Previous Purchases,
Frequency of Purchases, Purchase Amount (USD).
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

st.title("Sales Trend Prediction using KMeans Clustering")


# Cache the CSV read: without this, Streamlit re-reads the file on every
# widget interaction (every rerun of the script).
@st.cache_data
def load_data():
    """Load the shopping-trends dataset from the working directory."""
    return pd.read_csv("shopping_trends.csv")


df = load_data()

# Select relevant features for clustering
features = ['Gender', 'Item Purchased', 'Previous Purchases',
            'Frequency of Purchases', 'Purchase Amount (USD)']
df_filtered = df[features].copy()

# Frequency of Purchases may load as mixed types; force string so the
# one-hot encoder sees consistent categories.
df_filtered['Frequency of Purchases'] = df_filtered['Frequency of Purchases'].astype(str)

# One-hot encode categorical features (drop='first' avoids the dummy trap)
categorical_features = ['Gender', 'Item Purchased', 'Frequency of Purchases']
numerical_features = ['Previous Purchases', 'Purchase Amount (USD)']

ohe = OneHotEncoder(drop='first', sparse_output=False)
encoded_cats = ohe.fit_transform(df_filtered[categorical_features])
categorical_df = pd.DataFrame(
    encoded_cats,
    columns=ohe.get_feature_names_out(categorical_features),
    index=df.index,
)
df_processed = pd.concat([df_filtered[numerical_features], categorical_df], axis=1)

# Standardize so that the numeric columns don't dominate the distance metric
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_processed)

# KMeans clustering.  n_init is pinned explicitly because its default
# changed across sklearn versions (10 -> 'auto'); together with
# random_state this keeps cluster assignments reproducible.
n_clusters = 3  # Set the number of clusters
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Internal clustering-quality metrics (no ground-truth labels available)
silhouette = silhouette_score(X_scaled, df['Cluster'])
davies_bouldin = davies_bouldin_score(X_scaled, df['Cluster'])
calinski_harabasz = calinski_harabasz_score(X_scaled, df['Cluster'])


def predict_cluster(user_input):
    """Predict the KMeans cluster for a single customer.

    Parameters
    ----------
    user_input : dict
        Maps each name in ``features`` to the customer's value.

    Returns
    -------
    int
        The predicted cluster label.
    """
    user_df = pd.DataFrame([user_input])
    # Mirror the training-time string conversion
    user_df['Frequency of Purchases'] = user_df['Frequency of Purchases'].astype(str)
    user_cats = ohe.transform(user_df[categorical_features])
    user_processed = pd.concat(
        [user_df[numerical_features],
         pd.DataFrame(user_cats, columns=ohe.get_feature_names_out(categorical_features))],
        axis=1,
    )
    user_scaled = scaler.transform(user_processed)
    return kmeans.predict(user_scaled)[0]


# Cache the elbow curve: otherwise 9 KMeans models are re-fit on every
# Streamlit rerun, which makes every widget interaction slow.
@st.cache_data
def compute_elbow_distortions(X, k_values):
    """Return the KMeans inertia for each candidate cluster count."""
    distortions = []
    for k in k_values:
        kmeans_tmp = KMeans(n_clusters=k, n_init=10, random_state=42)
        kmeans_tmp.fit(X)
        distortions.append(kmeans_tmp.inertia_)
    return distortions


# Create Tabs
tab1, tab2, tab3 = st.tabs(["Dataset & Metrics", "Visualization", "Sales Trend Prediction"])

with tab1:
    st.subheader("Dataset Preview")
    st.write(df.head())

    st.subheader("Clustering Metrics")
    st.write(f"Number of Clusters: {n_clusters}")
    st.write(f"Silhouette Score: {silhouette:.4f}")
    st.write(f"Davies-Bouldin Score: {davies_bouldin:.4f}")
    st.write(f"Calinski-Harabasz Score: {calinski_harabasz:.4f}")

with tab2:
    st.subheader("Data Visualizations")

    # Elbow Method Visualization
    K_range = range(2, 11)
    distortions = compute_elbow_distortions(X_scaled, tuple(K_range))

    fig, ax = plt.subplots()
    ax.plot(K_range, distortions, marker='o')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Distortion')
    ax.set_title('Elbow Method for Optimal K')
    st.pyplot(fig)

    # Cluster Distribution Plot.  seaborn >=0.13 deprecates palette=
    # without hue=, so the variable is passed as hue with the redundant
    # legend suppressed.
    fig, ax = plt.subplots()
    sns.countplot(x=df['Cluster'], hue=df['Cluster'], palette='viridis',
                  legend=False, ax=ax)
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Count')
    ax.set_title('Cluster Distribution')
    st.pyplot(fig)

    # Visualizations for Item Purchased distribution
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(y=df['Item Purchased'],
                  order=df['Item Purchased'].value_counts().index,
                  hue=df['Item Purchased'], palette='viridis', ax=ax)
    ax.set_title("Overall Item Purchase Distribution")
    ax.set_xlabel("Count")
    ax.set_ylabel("Item Purchased")
    st.pyplot(fig)

    # Separate correlation matrices: each categorical label is label-encoded
    # to a numeric column so it can be correlated with the numeric features.
    labels = ['Gender', 'Frequency of Purchases', 'Item Purchased']
    for label in labels:
        df[f'{label}_Numeric'] = LabelEncoder().fit_transform(df[label])
        correlation_matrix = df[[f'{label}_Numeric', 'Purchase Amount (USD)',
                                 'Previous Purchases']].corr()

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
        st.subheader(f"Feature Correlation Matrix for {label}")
        st.pyplot(fig)

with tab3:
    st.subheader("Enter Customer Details")

    gender = st.selectbox("Gender", df['Gender'].unique())
    item_purchased = st.selectbox("Item Purchased", df['Item Purchased'].unique())
    previous_purchases = st.number_input("Previous Purchases", min_value=0,
                                         max_value=100, value=10)
    frequency_of_purchases = st.selectbox("Frequency of Purchases",
                                          df['Frequency of Purchases'].unique().astype(str))
    purchase_amount = st.number_input("Purchase Amount (USD)", min_value=1,
                                      max_value=500, value=50)

    if st.button("Predict Sales Trend"):
        user_input = {
            'Gender': gender,
            'Item Purchased': item_purchased,
            'Previous Purchases': previous_purchases,
            'Frequency of Purchases': frequency_of_purchases,
            'Purchase Amount (USD)': purchase_amount
        }
        predicted_cluster = predict_cluster(user_input)
        st.write(f"Predicted Cluster: {predicted_cluster}")

        st.subheader(f"Sales Trend Analysis for Cluster {predicted_cluster}")
        cluster_data = df[df['Cluster'] == predicted_cluster]

        # Visualization of top-selling items in the cluster (hue mirrors y
        # to keep palette= valid on seaborn >= 0.13)
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.countplot(y=cluster_data['Item Purchased'],
                      order=cluster_data['Item Purchased'].value_counts().index,
                      hue=cluster_data['Item Purchased'], palette='viridis',
                      legend=False, ax=ax)
        ax.set_title("Top Selling Items in This Cluster")
        ax.set_xlabel("Count")
        ax.set_ylabel("Item Purchased")
        st.pyplot(fig)

        # Display average purchase amount trend
        avg_purchase_amount = cluster_data.groupby('Item Purchased')['Purchase Amount (USD)'].mean()
        st.write("### Average Purchase Amount for Items in This Cluster:")
        st.write(avg_purchase_amount)