import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report

st.set_page_config(page_title="Breast Cancer Classification App", layout="wide")
st.title("Breast Cancer Classification Analysis")

with st.expander("About this App"):
    st.markdown("""
**Overview:** This application demonstrates classification of the Breast Cancer Wisconsin (Diagnostic) dataset using several machine learning models.

**Models included:**
- Logistic Regression
- Support Vector Machine (SVM)
- Random Forest
- Gradient Boosting
- K-Nearest Neighbors (KNN)
- MLP Neural Network

**Features:**
- Data preprocessing and scaling
- Visualization of confusion matrices, performance reports, and detailed result discussions
- Interactive model selection and performance comparison
""")

# Load the dataset (0 = malignant, 1 = benign) into a DataFrame.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df["target"] = data.target

st.subheader("Dataset Overview")
st.write(df.head())
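
# Showing the class balance next to the preview adds context; an optional,
# minimal addition using the 0/1 label convention noted above.
st.write(df["target"].value_counts().rename({0: "malignant", 1: "benign"}))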

X = df.drop("target", axis=1)
y = df["target"]

# Split first, then scale: fitting the scaler on the full dataset before
# splitting would leak test-set statistics into training.
test_size = st.sidebar.slider("Test Set Size", 0.1, 0.5, 0.2, step=0.05)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
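
# Note: passing stratify=y to the train_test_split call above would keep the
# malignant/benign ratio consistent across both splits; omitted here to
# preserve the original behavior.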

# Candidate models, mostly at scikit-learn defaults; random_state is pinned
# where it applies so reruns are reproducible.
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "SVM": SVC(kernel="linear"),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "MLP Neural Network": MLPClassifier(max_iter=500, random_state=42),
}

model_choice = st.sidebar.selectbox("Choose a model", list(models.keys()))
selected_model = models[model_choice]
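
# Streamlit re-executes this script on every widget interaction, so the chosen
# model is retrained on each rerun. That is fine for this small dataset; for
# costlier models, moving training into a helper decorated with
# @st.cache_resource would be one way to avoid redundant work.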

with st.spinner("Training model..."):
    selected_model.fit(X_train, y_train)
    y_pred = selected_model.predict(X_test)

# Convert numeric labels to names for readable reports and plots.
label_mapping = {0: "malignant", 1: "benign"}
y_test_labels = [label_mapping[label] for label in y_test]
y_pred_labels = [label_mapping[label] for label in y_pred]

cm = confusion_matrix(y_test_labels, y_pred_labels, labels=["malignant", "benign"])
cr = classification_report(y_test_labels, y_pred_labels, output_dict=True)

st.subheader(f"Confusion Matrix: {model_choice}")
fig, ax = plt.subplots(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax,
            xticklabels=["malignant", "benign"], yticklabels=["malignant", "benign"])
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.tight_layout()
st.pyplot(fig)
plt.close(fig)  # release the figure so it does not accumulate across reruns
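
# How to read the heatmap: rows are true labels, columns are predictions, so
# the "malignant" row / "benign" column cell counts malignant cases
# misclassified as benign, the costliest error for this task.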

st.subheader(f"Classification Report: {model_choice}")
cr_df = pd.DataFrame(cr).transpose()
st.dataframe(cr_df)

st.subheader("Result and Discussion")
if model_choice == "Logistic Regression":
    st.markdown("""
**Logistic Regression Discussion:**
- **Performance:** The model shows robust performance with clear separation between classes.
- **Strengths:** Fast, interpretable, and effective on linearly separable data.
- **Weaknesses:** May underperform on non-linear decision boundaries and can be sensitive to outliers.
""")
elif model_choice == "SVM":
    st.markdown("""
**SVM Discussion:**
- **Performance:** The linear SVM performs well on this dataset, handling high-dimensional data efficiently.
- **Strengths:** Effective when the number of features is large relative to the number of samples.
- **Weaknesses:** Kernel and regularization choices are crucial, and tuning them can be computationally expensive.
""")
elif model_choice == "Random Forest":
    st.markdown("""
**Random Forest Discussion:**
- **Performance:** Typically provides high accuracy and robust results thanks to ensemble averaging.
- **Strengths:** Handles non-linearity well and offers insight via feature importances.
- **Weaknesses:** Less interpretable than a single tree and may overfit if the trees are not properly tuned.
""")
elif model_choice == "Gradient Boosting":
    st.markdown("""
**Gradient Boosting Discussion:**
- **Performance:** Strong performance by sequentially correcting the errors of earlier trees.
- **Strengths:** Excellent at capturing complex data patterns.
- **Weaknesses:** Prone to overfitting if hyperparameters are not carefully tuned.
""")
elif model_choice == "KNN":
    st.markdown("""
**KNN Discussion:**
- **Performance:** Simple yet effective on this dataset, though results depend on the choice of `k`.
- **Strengths:** Easy to implement and understand.
- **Weaknesses:** Prediction is computationally expensive for large datasets, and the method is sensitive to feature scaling.
""")
elif model_choice == "MLP Neural Network":
    st.markdown("""
**MLP Neural Network Discussion:**
- **Performance:** Competitive accuracy from a flexible model that can capture non-linear relationships.
- **Strengths:** Can learn complex patterns given enough training data.
- **Weaknesses:** Requires careful hyperparameter tuning and more computation than simpler models.
""")
else:
    st.markdown("No discussion available for the selected model.")

st.download_button(
    "Download Classification Report as CSV",
    cr_df.to_csv().encode("utf-8"),
    "classification_report.csv",
    "text/csv",
)
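
# The "About" section lists performance comparison as a feature; this sketch is
# one minimal way to provide it. It refits every model in the models dict on
# the current split and ranks them by test accuracy via the estimators'
# built-in .score() method. A simple illustration, not a tuned benchmark.
if st.sidebar.checkbox("Compare all models"):
    st.subheader("Model Comparison")
    with st.spinner("Training all models..."):
        scores = {
            name: model.fit(X_train, y_train).score(X_test, y_test)
            for name, model in models.items()
        }
    comparison_df = pd.DataFrame.from_dict(scores, orient="index", columns=["Test Accuracy"])
    st.dataframe(comparison_df.sort_values("Test Accuracy", ascending=False))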