import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix
# Load dataset
@st.cache_data
def load_data():
    file_path = "car_price_dataset.csv"  # Ensure this file is in the same directory
    return pd.read_csv(file_path)

df = load_data()
# Streamlit App Title
st.title("🚗 Car Price Clustering & Evaluation")
# Creating Tabs
tab1, tab2, tab3 = st.tabs(["📊 Dataset Overview", "📈 Visual Matrix", "⚙️ User Input for Clustering"])
# --- TAB 1: Dataset Overview ---
with tab1:
st.write("## Dataset Overview")
st.write(df.head())
st.write(df.describe())
# --- TAB 2: Visualization Matrix ---
with tab2:
st.write("## Data Visualization")
# Select numerical features
numerical_df = df.select_dtypes(include=[np.number])
# Correlation Heatmap
st.write("### Correlation Heatmap")
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(numerical_df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
st.pyplot(fig)
# Confusion Matrix
st.write("### Confusion Matrix")
selected_features = ["Engine_Size", "Mileage", "Price"]
if all(f in numerical_df.columns for f in selected_features):
X = df[selected_features].dropna().values
n_clusters = 3 # Default cluster count
# Apply Hierarchical Clustering
hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
labels = hc.fit_predict(X)
# Generate dummy "true labels" (for demonstration)
true_labels = np.random.randint(0, n_clusters, len(labels))
cm = confusion_matrix(true_labels, labels)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d")
plt.xlabel("Predicted")
plt.ylabel("Actual")
st.pyplot(fig)
else:
st.warning("Not enough numerical data for clustering.")
# Scatter Plot
st.write("### Scatter Plot")
scatter_x = st.selectbox("Select X-axis", numerical_df.columns, index=0)
scatter_y = st.selectbox("Select Y-axis", numerical_df.columns, index=1)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x=df[scatter_x], y=df[scatter_y], alpha=0.7)
plt.xlabel(scatter_x)
plt.ylabel(scatter_y)
st.pyplot(fig)
# --- TAB 3: User Input & Clustering ---
with tab3:
st.write("## Perform Clustering")
numerical_features = numerical_df.columns.tolist()
selected_features = st.multiselect("Select features for clustering", numerical_features, default=["Engine_Size", "Mileage", "Price"])
if len(selected_features) < 2:
st.warning("Please select at least two numerical features.")
else:
X = df[selected_features].dropna().values # Prepare data
# Choose Number of Clusters (With + / - Buttons)
n_clusters = st.number_input("Select Number of Clusters", min_value=2, max_value=10, value=3, step=1)
# Predict Button
if st.button("Predict Clusters"):
# Apply Hierarchical Clustering
hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
labels = hc.fit_predict(X)
# Display results
df["Cluster"] = labels
st.write("### Clustered Data")
st.write(df[selected_features + ["Cluster"]].head(10))
st.success("Clustering Complete! 🎉")
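
# --- Note: optional sketch, not wired into the app above ---
# Ward-linkage agglomerative clustering minimizes within-cluster variance, so
# features on very different scales (e.g. Price in the tens of thousands vs.
# Engine_Size in single digits) can dominate the distance computation. If that
# matters for this dataset, the features could be standardized before
# fit_predict, for example:
#
#   from sklearn.preprocessing import StandardScaler
#   X_scaled = StandardScaler().fit_transform(X)
#   labels = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward").fit_predict(X_scaled)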