import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix

@st.cache_data
def load_data(file_path: str = "car_price_dataset.csv") -> pd.DataFrame:
    """Read the car price dataset from a CSV file into a DataFrame.

    The function is wrapped in ``st.cache_data`` so Streamlit can reuse the
    parsed result across script reruns instead of re-reading the file.

    Args:
        file_path: Path of the CSV file to load. Defaults to
            ``car_price_dataset.csv``, resolved against the working
            directory.

    Returns:
        The file contents as produced by ``pandas.read_csv``.
    """
    return pd.read_csv(file_path)

# Load the dataset once per script run; the tabs below all read this frame.
df = load_data()

st.title("🚗 Car Price Clustering & Evaluation")

# One tab per section of the app, unpacked in display order.
tab_labels = [
    "📊 Dataset Overview",
    "📈 Visual Matrix",
    "⚙️ User Input for Clustering",
]
tab1, tab2, tab3 = st.tabs(tab_labels)

with tab1:
    st.write("## Dataset Overview")

    # Show the first rows, then the summary statistics, in that order.
    for view in (df.head(), df.describe()):
        st.write(view)

with tab2:
    st.write("## Data Visualization")

    # Only numeric columns can be correlated, clustered or scatter-plotted.
    # The name is kept as-is because the clustering tab reads it as well.
    numerical_df = df.select_dtypes(include=[np.number])

    st.write("### Correlation Heatmap")
    fig, ax = plt.subplots(figsize=(8, 5))
    # Draw on the axes created above instead of pyplot's implicit
    # "current" axes.
    sns.heatmap(
        numerical_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax
    )
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates; close it once it
    # has been rendered so figures do not pile up across Streamlit reruns.
    plt.close(fig)

    st.write("### Confusion Matrix")
    selected_features = ["Engine_Size", "Mileage", "Price"]
    n_clusters = 3

    has_features = all(f in numerical_df.columns for f in selected_features)
    if has_features:
        X = df[selected_features].dropna().values

    # Clustering needs the expected columns and at least one row per cluster.
    if has_features and len(X) >= n_clusters:
        hc = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
        labels = hc.fit_predict(X)

        # NOTE(review): there is no ground-truth column here -- the "actual"
        # labels are drawn at random on every run, so this matrix does not
        # measure clustering quality. Confirm whether a real reference
        # labelling (or an unsupervised metric) should replace it.
        true_labels = np.random.randint(0, n_clusters, len(labels))

        cm = confusion_matrix(true_labels, labels)
        fig, ax = plt.subplots(figsize=(5, 4))
        sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        plt.close(fig)
    else:
        st.warning("Not enough numerical data for clustering.")

    st.write("### Scatter Plot")
    numeric_columns = numerical_df.columns
    if len(numeric_columns) == 0:
        # Without any numeric column there is nothing to offer in the
        # selectors below.
        st.warning("No numerical columns available for plotting.")
    else:
        scatter_x = st.selectbox("Select X-axis", numeric_columns, index=0)
        # Default Y to the second numeric column when one exists; fall back
        # to the first so the default index is always valid.
        scatter_y = st.selectbox(
            "Select Y-axis",
            numeric_columns,
            index=min(1, len(numeric_columns) - 1),
        )

        fig, ax = plt.subplots(figsize=(6, 4))
        sns.scatterplot(x=df[scatter_x], y=df[scatter_y], alpha=0.7, ax=ax)
        ax.set_xlabel(scatter_x)
        ax.set_ylabel(scatter_y)
        st.pyplot(fig)
        plt.close(fig)

with tab3:
    st.write("## Perform Clustering")

    numerical_features = numerical_df.columns.tolist()

    # st.multiselect rejects default values that are not among its options,
    # so only pre-select the preferred columns this dataset actually has.
    preferred_defaults = ["Engine_Size", "Mileage", "Price"]
    default_features = [
        f for f in preferred_defaults if f in numerical_features
    ]
    selected_features = st.multiselect(
        "Select features for clustering",
        numerical_features,
        default=default_features,
    )

    if len(selected_features) < 2:
        st.warning("Please select at least two numerical features.")
    else:
        # Keep the filtered frame (not only its values) so the surviving row
        # index is available to match cluster labels back to their rows.
        complete_rows = df[selected_features].dropna()
        X = complete_rows.values

        n_clusters = st.number_input(
            "Select Number of Clusters",
            min_value=2,
            max_value=10,
            value=3,
            step=1,
        )

        if st.button("Predict Clusters"):
            if len(X) < n_clusters:
                st.warning(
                    "Not enough complete rows for the requested number of clusters."
                )
            else:
                hc = AgglomerativeClustering(
                    n_clusters=n_clusters, linkage="ward"
                )
                labels = hc.fit_predict(X)

                # `labels` has one entry per row that survived dropna(), which
                # can be fewer than len(df). Assigning through an indexed
                # Series aligns each label with its row; rows that were
                # dropped get <NA> instead of raising a length mismatch.
                df["Cluster"] = pd.Series(
                    labels, index=complete_rows.index, dtype="Int64"
                )

                st.write("### Clustered Data")
                st.write(
                    df.loc[
                        complete_rows.index, selected_features + ["Cluster"]
                    ].head(10)
                )

                st.success("Clustering Complete! 🎉")