File size: 3,482 Bytes
b944f72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix

# Load dataset
@st.cache_data
def load_data(file_path: str = "car_price_dataset.csv") -> pd.DataFrame:
    """Read the car-price CSV into a DataFrame.

    Cached by Streamlit so the file is only parsed once per session.

    Parameters
    ----------
    file_path : str
        Path to the CSV file. Defaults to ``car_price_dataset.csv`` in the
        working directory (generalized from the original hard-coded path;
        callers without arguments see identical behavior).
    """
    return pd.read_csv(file_path)

df = load_data()

# Streamlit App Title
st.title("🚗 Car Price Clustering & Evaluation")

# Creating Tabs
# NOTE: tab1/tab2/tab3 are referenced by the `with` sections below; the
# labels here are display-only strings.
tab1, tab2, tab3 = st.tabs(["📊 Dataset Overview", "📈 Visual Matrix", "⚙️ User Input for Clustering"])

# --- TAB 1: Dataset Overview ---
with tab1:
    # Show the first rows plus summary statistics so users can
    # sanity-check the loaded dataset before exploring the other tabs.
    st.write("## Dataset Overview")
    st.write(df.head())
    st.write(df.describe())

# --- TAB 2: Visualization Matrix ---
with tab2:
    st.write("## Data Visualization")

    # Restrict to numeric columns; also reused by tab3 for feature selection.
    numerical_df = df.select_dtypes(include=[np.number])

    # Correlation Heatmap
    st.write("### Correlation Heatmap")
    fig, ax = plt.subplots(figsize=(8, 5))
    # Bind the plot explicitly to `ax` rather than relying on pyplot's
    # implicit "current axes" state, which is fragile across reruns.
    sns.heatmap(numerical_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    st.pyplot(fig)
    plt.close(fig)  # release the figure; Streamlit has already rendered it

    # Confusion Matrix
    st.write("### Confusion Matrix")
    cm_features = ["Engine_Size", "Mileage", "Price"]
    if all(f in numerical_df.columns for f in cm_features):
        X = df[cm_features].dropna().values
        n_clusters = 3  # Default cluster count

        # Apply Hierarchical Clustering
        hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        labels = hc.fit_predict(X)

        # NOTE(review): these "true labels" are random, so the resulting
        # matrix only demonstrates the plot — it carries no evaluation
        # meaning. Replace with real ground truth if one exists.
        true_labels = np.random.randint(0, n_clusters, len(labels))

        cm = confusion_matrix(true_labels, labels)
        fig, ax = plt.subplots(figsize=(5, 4))
        sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        plt.close(fig)
    else:
        st.warning("Not enough numerical data for clustering.")

    # Scatter Plot
    st.write("### Scatter Plot")
    scatter_x = st.selectbox("Select X-axis", numerical_df.columns, index=0)
    # Guard: index=1 would raise IndexError if only one numeric column exists.
    y_default = 1 if len(numerical_df.columns) > 1 else 0
    scatter_y = st.selectbox("Select Y-axis", numerical_df.columns, index=y_default)

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x=df[scatter_x], y=df[scatter_y], alpha=0.7, ax=ax)
    ax.set_xlabel(scatter_x)
    ax.set_ylabel(scatter_y)
    st.pyplot(fig)
    plt.close(fig)

# --- TAB 3: User Input & Clustering ---
with tab3:
    st.write("## Perform Clustering")

    numerical_features = numerical_df.columns.tolist()
    # Only offer defaults that actually exist in this dataset:
    # st.multiselect raises if a default value is not among the options.
    preferred_defaults = [f for f in ("Engine_Size", "Mileage", "Price") if f in numerical_features]
    selected_features = st.multiselect(
        "Select features for clustering", numerical_features, default=preferred_defaults
    )

    if len(selected_features) < 2:
        st.warning("Please select at least two numerical features.")
    else:
        # Keep the dropna'd frame (not just .values) so the original row
        # index survives — needed to align cluster labels back onto `df`.
        clean = df[selected_features].dropna()

        # Choose Number of Clusters (With + / - Buttons)
        n_clusters = st.number_input("Select Number of Clusters", min_value=2, max_value=10, value=3, step=1)

        # Predict Button
        if st.button("Predict Clusters"):
            # Apply Hierarchical Clustering
            hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
            labels = hc.fit_predict(clean.values)

            # BUG FIX: the original assigned `df["Cluster"] = labels`, which
            # raises ValueError whenever dropna() removed rows (length
            # mismatch). Align labels onto the surviving rows by index.
            df.loc[clean.index, "Cluster"] = labels

            # Display results
            st.write("### Clustered Data")
            st.write(df.loc[clean.index, selected_features + ["Cluster"]].head(10))

            st.success("Clustering Complete! 🎉")