"""Streamlit app for exploring a car price dataset and clustering it.

The page has three tabs: a dataset overview, a set of visualizations
(correlation heatmap, a demonstration confusion matrix, and a scatter plot),
and an interactive hierarchical-clustering form.
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix

# Columns used for the demonstration clustering and pre-selected in tab 3.
DEFAULT_FEATURES = ["Engine_Size", "Mileage", "Price"]


# Load dataset
@st.cache_data
def load_data() -> pd.DataFrame:
    """Read the car price dataset from the CSV file next to this script.

    Returns:
        The CSV contents as a DataFrame (cached by Streamlit between reruns).
    """
    file_path = "car_price_dataset.csv"  # Ensure this file is in the same directory
    return pd.read_csv(file_path)


def _show_figure(fig) -> None:
    """Render a matplotlib figure in the app, then release it.

    Streamlit reruns the whole script on every interaction, so figures that
    are never closed pile up in pyplot's global registry.
    """
    st.pyplot(fig)
    plt.close(fig)


df = load_data()

# Streamlit App Title
st.title("🚗 Car Price Clustering & Evaluation")

# Creating Tabs
tab1, tab2, tab3 = st.tabs(
    ["📊 Dataset Overview", "📈 Visual Matrix", "⚙️ User Input for Clustering"]
)

# --- TAB 1: Dataset Overview ---
with tab1:
    st.write("## Dataset Overview")
    st.write(df.head())
    st.write(df.describe())

# --- TAB 2: Visualization Matrix ---
with tab2:
    st.write("## Data Visualization")

    # Select numerical features
    numerical_df = df.select_dtypes(include=[np.number])

    # Correlation Heatmap
    st.write("### Correlation Heatmap")
    fig, ax = plt.subplots(figsize=(8, 5))
    # Draw on the explicit Axes rather than relying on pyplot's "current axes".
    sns.heatmap(numerical_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    _show_figure(fig)

    # Confusion Matrix
    st.write("### Confusion Matrix")
    n_clusters = 3  # Default cluster count
    has_default_features = all(f in numerical_df.columns for f in DEFAULT_FEATURES)
    if has_default_features:
        X = df[DEFAULT_FEATURES].dropna().values
    else:
        X = np.empty((0, len(DEFAULT_FEATURES)))

    # Clustering needs at least as many complete rows as clusters.
    if has_default_features and len(X) >= n_clusters:
        # Apply Hierarchical Clustering
        # NOTE(review): features are clustered unscaled, so the column with the
        # largest numeric range dominates the distances — confirm this is intended.
        hc = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
        labels = hc.fit_predict(X)

        # Generate dummy "true labels" (for demonstration). They are random, so
        # this matrix only illustrates the plot and changes on every rerun.
        true_labels = np.random.randint(0, n_clusters, len(labels))
        cm = confusion_matrix(true_labels, labels)

        fig, ax = plt.subplots(figsize=(5, 4))
        sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        _show_figure(fig)
    else:
        st.warning("Not enough numerical data for clustering.")

    # Scatter Plot
    st.write("### Scatter Plot")
    numeric_columns = numerical_df.columns
    # A default index of 1 is only valid when a second numeric column exists.
    default_y_index = 1 if len(numeric_columns) > 1 else 0
    scatter_x = st.selectbox("Select X-axis", numeric_columns, index=0)
    scatter_y = st.selectbox("Select Y-axis", numeric_columns, index=default_y_index)

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x=df[scatter_x], y=df[scatter_y], alpha=0.7, ax=ax)
    ax.set_xlabel(scatter_x)
    ax.set_ylabel(scatter_y)
    _show_figure(fig)

# --- TAB 3: User Input & Clustering ---
with tab3:
    st.write("## Perform Clustering")

    numerical_features = numerical_df.columns.tolist()
    # Streamlit rejects defaults that are not among the options, so only
    # pre-select the default features that actually exist as numeric columns.
    available_defaults = [f for f in DEFAULT_FEATURES if f in numerical_features]
    selected_features = st.multiselect(
        "Select features for clustering",
        numerical_features,
        default=available_defaults,
    )

    if len(selected_features) < 2:
        st.warning("Please select at least two numerical features.")
    else:
        # Prepare data: keep the filtered frame so labels can be matched back
        # to exactly the rows that were clustered.
        complete_rows = df[selected_features].dropna()
        X = complete_rows.values

        # Choose Number of Clusters (With + / - Buttons)
        n_clusters = st.number_input(
            "Select Number of Clusters", min_value=2, max_value=10, value=3, step=1
        )

        # Predict Button
        if st.button("Predict Clusters"):
            if len(X) < n_clusters:
                st.warning("Not enough numerical data for clustering.")
            else:
                # Apply Hierarchical Clustering
                hc = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
                labels = hc.fit_predict(X)

                # Attach labels by index: rows dropped for missing values get
                # no label instead of causing a length-mismatch error.
                df["Cluster"] = pd.Series(labels, index=complete_rows.index)
                clustered = complete_rows.assign(Cluster=labels)

                # Display results
                st.write("### Clustered Data")
                st.write(clustered.head(10))

                st.success("Clustering Complete! 🎉")