File size: 6,493 Bytes
7c2913f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import sklearn tools
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
# Text shown inside the collapsible "About this App" panel (rendered as Markdown).
ABOUT_MARKDOWN = """
**Overview:** This application demonstrates classification of the Breast Cancer dataset using several machine learning models.
**Models included:**
- Logistic Regression
- Support Vector Machine (SVM)
- Random Forest
- Gradient Boosting
- K-Nearest Neighbors (KNN)
- MLP Neural Network
**Features:**
- Data preprocessing and scaling
- Visualization of confusion matrices, performance reports, and detailed result discussions
- Interactive model selection and performance comparison
"""

# Configure the browser tab title and use the wide page layout, then draw the main heading.
st.set_page_config(page_title="Breast Cancer Classification App", layout="wide")
st.title("Breast Cancer Classification Analysis")

# Optional banner image, disabled because it requires the image file to exist:
# st.image("breast_cancer_banner.jpg", caption="Breast Cancer Analysis", use_column_width=True)

# Collapsible description of what the app does.
with st.expander("About this App"):
    st.markdown(ABOUT_MARKDOWN)
# Load the Breast Cancer dataset into a DataFrame: one column per feature
# plus the class label in the "target" column.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Show the first rows of the raw (unscaled) dataset.
st.subheader("Dataset Overview")
st.write(df.head())

# Separate the feature matrix from the label column.
X = df.drop("target", axis=1)
y = df["target"]

# Sidebar: let the user pick the fraction of rows held out for testing.
test_size = st.sidebar.slider("Test Set Size", 0.1, 0.5, 0.2, step=0.05)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# mean/variance of the held-out rows influence preprocessing (data leakage)
# and make the reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Seed for the estimators that use randomness internally. Streamlit re-executes
# the whole script on every widget interaction, so without a fixed seed the
# reported metrics would change between reruns even when no input changed.
MODEL_RANDOM_STATE = 42

# Registry of available classifiers, keyed by the name shown in the sidebar.
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "SVM": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, random_state=MODEL_RANDOM_STATE
    ),
    "Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_RANDOM_STATE),
    "KNN": KNeighborsClassifier(),
    "MLP Neural Network": MLPClassifier(max_iter=500, random_state=MODEL_RANDOM_STATE),
}

# Sidebar: model selection.
model_choice = st.sidebar.selectbox("Choose a model", list(models.keys()))
selected_model = models[model_choice]

# Train the selected model while showing a progress spinner.
with st.spinner("Training model..."):
    selected_model.fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = selected_model.predict(X_test)
# Map the numeric class codes to readable names; the order of the values also
# fixes the row/column order of the confusion matrix below.
label_mapping = {0: "malignant", 1: "benign"}
class_names = list(label_mapping.values())
y_test_labels = [label_mapping[label] for label in y_test]
y_pred_labels = [label_mapping[label] for label in y_pred]

# Evaluate model performance on the test split.
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=class_names)
cr = classification_report(y_test_labels, y_pred_labels, output_dict=True)

# Display the confusion matrix as a small annotated heatmap.
st.subheader(f"Confusion Matrix: {model_choice}")
fig, ax = plt.subplots(figsize=(4, 3))
try:
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax,
                xticklabels=class_names, yticklabels=class_names)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    # Lay out this specific figure rather than pyplot's implicit current figure.
    fig.tight_layout()
    st.pyplot(fig)
finally:
    # Figures created through pyplot stay registered until explicitly closed;
    # close this one so reruns of the script do not accumulate open figures.
    plt.close(fig)

# Display the classification report as a table (one row per class/average).
st.subheader(f"Classification Report: {model_choice}")
cr_df = pd.DataFrame(cr).transpose()
st.dataframe(cr_df)
# Result and Discussion section: one Markdown write-up per model name.
st.subheader("Result and Discussion")

model_discussions = {
    "Logistic Regression": """
**Logistic Regression Discussion:**
- **Performance:** The model shows robust performance with clear separation between classes.
- **Strengths:** It is fast, interpretable, and performs well on linearly separable data.
- **Weaknesses:** May underperform on non-linear boundaries and could be sensitive to outliers.
""",
    "SVM": """
**SVM Discussion:**
- **Performance:** The linear SVM performs well for this dataset, handling high-dimensional data efficiently.
- **Strengths:** Effective in cases where the number of features is greater than the number of samples.
- **Weaknesses:** Tuning parameters (like the kernel) is crucial and can be computationally expensive.
""",
    "Random Forest": """
**Random Forest Discussion:**
- **Performance:** Typically provides high accuracy and robust results due to ensemble learning.
- **Strengths:** Handles non-linearity well and provides insights via feature importance.
- **Weaknesses:** Can be less interpretable and may overfit if the trees are not properly tuned.
""",
    "Gradient Boosting": """
**Gradient Boosting Discussion:**
- **Performance:** Demonstrates strong performance by sequentially improving on previous errors.
- **Strengths:** Excellent for handling complex data patterns.
- **Weaknesses:** Sensitive to overfitting if hyperparameters are not carefully optimized.
""",
    "KNN": """
**KNN Discussion:**
- **Performance:** Simple yet effective for this dataset, though performance depends on the choice of 'k'.
- **Strengths:** Easy to implement and understand.
- **Weaknesses:** Computationally expensive for large datasets and sensitive to feature scaling.
""",
    "MLP Neural Network": """
**MLP Neural Network Discussion:**
- **Performance:** Provides competitive accuracy with a flexible model that can capture non-linear relationships.
- **Strengths:** Can learn complex patterns with enough training data.
- **Weaknesses:** Requires careful tuning of hyperparameters and more computational resources compared to simpler models.
""",
}

# Fall back to a generic message when the selected name has no write-up.
discussion_text = model_discussions.get(
    model_choice, "No discussion available for the selected model."
)
st.markdown(discussion_text)
# Offer the classification report table as a UTF-8 encoded CSV download.
report_csv_bytes = cr_df.to_csv().encode('utf-8')
st.download_button(
    label="Download Classification Report as CSV",
    data=report_csv_bytes,
    file_name="classification_report.csv",
    mime="text/csv",
)
|