import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import sklearn tools
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Set up page configuration and title
st.set_page_config(page_title="Breast Cancer Classification App", layout="wide")
st.title("Breast Cancer Classification Analysis")

# Display a header image (ensure you have this image file)
# st.image("breast_cancer_banner.jpg", caption="Breast Cancer Analysis", use_column_width=True)

# About the app
with st.expander("About this App"):
    st.markdown("""
    **Overview:**
    This application demonstrates classification of the Breast Cancer dataset using several machine learning models.

    **Models included:**
    - Logistic Regression
    - Support Vector Machine (SVM)
    - Random Forest
    - Gradient Boosting
    - K-Nearest Neighbors (KNN)
    - MLP Neural Network

    **Features:**
    - Data preprocessing and scaling
    - Visualization of confusion matrices, performance reports, and detailed result discussions
    - Interactive model selection and performance comparison
    """)

# Load the Breast Cancer dataset
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Display the raw dataset
st.subheader("Dataset Overview")
st.write(df.head())

# Split data and preprocess
X = df.drop("target", axis=1)
y = df["target"]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Sidebar: Allow the user to select the test set size
test_size = st.sidebar.slider("Test Set Size", 0.1, 0.5, 0.2, step=0.05)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=42)

# Dictionary of models
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "SVM": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(),
    "KNN": KNeighborsClassifier(),
    "MLP Neural Network": MLPClassifier(max_iter=500)
}

# Sidebar: Model selection
model_choice = st.sidebar.selectbox("Choose a model", list(models.keys()))
selected_model = models[model_choice]

# Train the selected model
with st.spinner("Training model..."):
    selected_model.fit(X_train, y_train)
    y_pred = selected_model.predict(X_test)

# Map labels for readability
label_mapping = {0: "malignant", 1: "benign"}
y_test_labels = [label_mapping[label] for label in y_test]
y_pred_labels = [label_mapping[label] for label in y_pred]

# Evaluate model performance
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=["malignant", "benign"])
cr = classification_report(y_test_labels, y_pred_labels, output_dict=True)

# Display the confusion matrix with a smaller figure size
st.subheader(f"Confusion Matrix: {model_choice}")
fig, ax = plt.subplots(figsize=(4, 3))  # Reduced size so the plot fits the page
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax,
            xticklabels=["malignant", "benign"], yticklabels=["malignant", "benign"])
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.tight_layout()  # Adjusts the layout to fit within the figure area
st.pyplot(fig)

# Display the classification report
st.subheader(f"Classification Report: {model_choice}") cr_df = pd.DataFrame(cr).transpose() st.dataframe(cr_df) # Result and Discussion section st.subheader("Result and Discussion") if model_choice == "Logistic Regression": st.markdown(""" **Logistic Regression Discussion:** - **Performance:** The model shows robust performance with clear separation between classes. - **Strengths:** It is fast, interpretable, and performs well on linearly separable data. - **Weaknesses:** May underperform on non-linear boundaries and could be sensitive to outliers. """) elif model_choice == "SVM": st.markdown(""" **SVM Discussion:** - **Performance:** The linear SVM performs well for this dataset, handling high-dimensional data efficiently. - **Strengths:** Effective in cases where the number of features is greater than the number of samples. - **Weaknesses:** Tuning parameters (like the kernel) is crucial and can be computationally expensive. """) elif model_choice == "Random Forest": st.markdown(""" **Random Forest Discussion:** - **Performance:** Typically provides high accuracy and robust results due to ensemble learning. - **Strengths:** Handles non-linearity well and provides insights via feature importance. - **Weaknesses:** Can be less interpretable and may overfit if the trees are not properly tuned. """) elif model_choice == "Gradient Boosting": st.markdown(""" **Gradient Boosting Discussion:** - **Performance:** Demonstrates strong performance by sequentially improving on previous errors. - **Strengths:** Excellent for handling complex data patterns. - **Weaknesses:** Sensitive to overfitting if hyperparameters are not carefully optimized. """) elif model_choice == "KNN": st.markdown(""" **KNN Discussion:** - **Performance:** Simple yet effective for this dataset, though performance depends on the choice of 'k'. - **Strengths:** Easy to implement and understand. - **Weaknesses:** Computationally expensive for large datasets and sensitive to feature scaling. """) elif model_choice == "MLP Neural Network": st.markdown(""" **MLP Neural Network Discussion:** - **Performance:** Provides competitive accuracy with a flexible model that can capture non-linear relationships. - **Strengths:** Can learn complex patterns with enough training data. - **Weaknesses:** Requires careful tuning of hyperparameters and more computational resources compared to simpler models. """) else: st.markdown("No discussion available for the selected model.") # Optionally, provide a download button for the classification report st.download_button("Download Classification Report as CSV", cr_df.to_csv().encode('utf-8'), "classification_report.csv", "text/csv")