File size: 3,482 Bytes
b944f72 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix
# Load dataset (cached by Streamlit so the CSV is parsed once per session)
@st.cache_data
def load_data():
    """Read the car-price CSV from the working directory into a DataFrame."""
    csv_path = "car_price_dataset.csv"  # Ensure this file is in the same directory
    return pd.read_csv(csv_path)


df = load_data()
# Streamlit App Title
st.title("🚗 Car Price Clustering & Evaluation")

# Creating Tabs
tab1, tab2, tab3 = st.tabs(["📊 Dataset Overview", "📈 Visual Matrix", "⚙️ User Input for Clustering"])

# --- TAB 1: Dataset Overview ---
with tab1:
    st.write("## Dataset Overview")
    # First rows plus summary statistics give a quick feel for the data.
    st.write(df.head())
    st.write(df.describe())
# --- TAB 2: Visualization Matrix ---
with tab2:
    st.write("## Data Visualization")

    # Restrict to numeric columns: correlation and clustering need numbers only.
    numerical_df = df.select_dtypes(include=[np.number])

    # Correlation Heatmap
    st.write("### Correlation Heatmap")
    fig, ax = plt.subplots(figsize=(8, 5))
    # Draw on the explicit axes (ax=) rather than the implicit pyplot state,
    # so the heatmap cannot land on the wrong figure.
    sns.heatmap(numerical_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    st.pyplot(fig)

    # Confusion Matrix
    st.write("### Confusion Matrix")
    selected_features = ["Engine_Size", "Mileage", "Price"]
    if all(f in numerical_df.columns for f in selected_features):
        # Drop incomplete rows so the clustering input has no NaNs.
        X = df[selected_features].dropna().values
        n_clusters = 3  # Default cluster count

        # Apply Hierarchical Clustering (Ward linkage minimizes within-cluster variance)
        hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        labels = hc.fit_predict(X)

        # Generate dummy "true labels" (for demonstration ONLY — the dataset has
        # no ground truth, so this matrix is illustrative, not an evaluation).
        # Seeded RNG keeps the display stable across Streamlit reruns.
        rng = np.random.default_rng(42)
        true_labels = rng.integers(0, n_clusters, len(labels))
        cm = confusion_matrix(true_labels, labels)

        fig, ax = plt.subplots(figsize=(5, 4))
        sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
    else:
        st.warning("Not enough numerical data for clustering.")

    # Scatter Plot
    st.write("### Scatter Plot")
    scatter_x = st.selectbox("Select X-axis", numerical_df.columns, index=0)
    # Guard the default Y index: index=1 would crash with a single numeric column.
    scatter_y = st.selectbox("Select Y-axis", numerical_df.columns,
                             index=min(1, len(numerical_df.columns) - 1))
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x=df[scatter_x], y=df[scatter_y], alpha=0.7, ax=ax)
    ax.set_xlabel(scatter_x)
    ax.set_ylabel(scatter_y)
    st.pyplot(fig)
# --- TAB 3: User Input & Clustering ---
with tab3:
    st.write("## Perform Clustering")

    numerical_features = numerical_df.columns.tolist()
    selected_features = st.multiselect("Select features for clustering", numerical_features, default=["Engine_Size", "Mileage", "Price"])

    if len(selected_features) < 2:
        st.warning("Please select at least two numerical features.")
    else:
        # Keep the NaN-free rows as a DataFrame (not just .values): the cluster
        # labels must be attached to exactly these rows.  Assigning the labels
        # straight onto `df` (as `df["Cluster"] = labels`) raises a length-
        # mismatch ValueError whenever dropna() removed any row, and would also
        # mutate the st.cache_data-cached frame shared across reruns.
        clean = df[selected_features].dropna()
        X = clean.values

        # Choose Number of Clusters (With + / - Buttons)
        n_clusters = st.number_input("Select Number of Clusters", min_value=2, max_value=10, value=3, step=1)

        # Predict Button
        if st.button("Predict Clusters"):
            # Apply Hierarchical Clustering
            hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
            labels = hc.fit_predict(X)

            # Display results on a copy so the cached `df` stays untouched.
            result = clean.copy()
            result["Cluster"] = labels
            st.write("### Clustered Data")
            st.write(result.head(10))
            st.success("Clustering Complete! 🎉")
|