import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

# Function to visualize the decision boundary of a trained classifier
def visualize_classifier(classifier, X, y, title=''):
    """Plot the decision regions of `classifier` over a 2-D feature space."""
    # Pad the plotting range by one unit around the data
    min_x, max_x = X[:, 0].min() - 1.0, X[:, 0].max() + 1.0
    min_y, max_y = X[:, 1].min() - 1.0, X[:, 1].max() + 1.0
    # Evaluate the classifier on a dense grid to recover the decision regions
    mesh_step_size = 0.01
    x_vals, y_vals = np.meshgrid(np.arange(min_x, max_x, mesh_step_size), 
                                  np.arange(min_y, max_y, mesh_step_size))
    output = classifier.predict(np.c_[x_vals.ravel(), y_vals.ravel()])
    output = output.reshape(x_vals.shape)
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.pcolormesh(x_vals, y_vals, output, cmap=plt.cm.gray, shading='auto')
    ax.scatter(X[:, 0], X[:, 1], c=y, s=75, edgecolors='black', linewidth=1, cmap=plt.cm.Paired)
    ax.set_xlim(x_vals.min(), x_vals.max())
    ax.set_ylim(y_vals.min(), y_vals.max())
    ax.set_xticks(np.arange(int(X[:, 0].min() - 1), int(X[:, 0].max() + 1), 1.0))
    ax.set_yticks(np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0))
    st.pyplot(fig)

def main():
    st.title("SVM Kernel Performance Comparison")

    about = """
    # 🧠 SVM Kernel Comparison: Understanding the Impact on Overlapped Data  

    In machine learning, **Support Vector Machines (SVMs)** are powerful classifiers that work well for both linear and non-linear decision boundaries. However, the performance of an SVM heavily depends on the **choice of kernel function**. Let's analyze how different kernels handle **overlapped data** and why choosing the right kernel is crucial.  

    ## πŸ” Kernel Performance Breakdown  

    ### 1️⃣ **Linear Kernel** 🟢  
    - 📏 Assumes the data is **linearly separable** (i.e., can be divided by a straight line).  
    - ✅ Works well when classes are well-separated.  
    - ❌ Struggles with highly overlapped data, leading to **poor generalization**.  
    - 🚀 **Best for:** High-dimensional sparse data (e.g., text classification).  
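
    A minimal sketch of fitting this kernel (illustrative, not the app's code; `X_train` and `y_train` stand for an already-split 2-D dataset):

    ```python
    from sklearn.svm import SVC

    # C trades off margin width against misclassified training points
    linear_clf = SVC(kernel='linear', C=1.0)
    linear_clf.fit(X_train, y_train)
    ```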

    ### 2️⃣ **Polynomial Kernel** 📈  
    - 🔄 Expands feature space by computing polynomial combinations of features.  
    - ✅ Can model more complex decision boundaries.  
    - ❌ **High-degree polynomials** can lead to **overfitting**.  
    - 🚀 **Best for:** Medium-complexity patterns where interactions between features matter.  
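
    A similar sketch for this kernel; `degree=3` is scikit-learn's default, and the `coef0` value is illustrative (it controls how much low-degree terms contribute):

    ```python
    # Higher degree = more flexible boundary, but greater overfitting risk
    poly_clf = SVC(kernel='poly', degree=3, coef0=1.0)
    poly_clf.fit(X_train, y_train)
    ```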

    ### 3️⃣ **Radial Basis Function (RBF) Kernel** 🔵  
    - 🔥 Uses **Gaussian similarity** to map data into a higher-dimensional space.  
    - ✅ Excels in handling **highly non-linear** and **overlapped** data.  
    - ❌ Requires careful tuning of the **gamma** parameter to avoid underfitting or overfitting.  
    - 🚀 **Best for:** Complex, non-linear relationships (e.g., image classification).  
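
    And for this kernel, where `gamma` sets the width of the Gaussian similarity; `'scale'` is scikit-learn's default:

    ```python
    # gamma='scale' means 1 / (n_features * X.var()), a sensible starting point
    rbf_clf = SVC(kernel='rbf', gamma='scale')
    rbf_clf.fit(X_train, y_train)
    ```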

    ## 🎯 Choosing the Right Kernel  
    - If data is **linearly separable**, a **linear kernel** is efficient and interpretable.  
    - If data has **moderate overlap**, a **polynomial kernel** provides flexibility.  
    - If data is **highly overlapped and non-linear**, the **RBF kernel** is often the best choice.  

    ### 🤖 Key Takeaway  
    The **right kernel choice** significantly impacts classification accuracy. While RBF is a strong default for **complex overlapped data**, simpler kernels should be preferred when appropriate to reduce computation cost and improve interpretability. **Experimentation and hyperparameter tuning are essential** to achieving the best results.  
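
    A sketch of such tuning with cross-validated grid search (grid values are illustrative, not prescriptive; `X_train`/`y_train` as above):

    ```python
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    # Search over kernel, regularization strength, and kernel width
    param_grid = {
        "kernel": ["linear", "poly", "rbf"],
        "C": [0.1, 1, 10],
        "gamma": ["scale", 0.1, 1.0],
    }
    search = GridSearchCV(SVC(), param_grid, cv=5)
    search.fit(X_train, y_train)
    print(search.best_params_, search.best_score_)
    ```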

    🔎 *"There is no one-size-fits-all kernel – understanding your data is the key to unlocking SVM's full potential!"* 
    
    🚀 Created by: Louie F. Cervantes, M.Eng. (Information Engineering)
    """
    with st.expander("About SVM Kernels"):
        st.markdown(about)

    # Load the bundled dataset from a fixed path (no upload widget is used)
    data_file = './data/overlapped.csv'
    if data_file:
        df = pd.read_csv(data_file)
        st.write("### Data Preview")
        st.dataframe(df)
        
        # Assume the last column is the target; convert to NumPy arrays so the
        # classifier sees the same input type during training and visualization
        X = df.iloc[:, :-1].to_numpy()
        y = df.iloc[:, -1].to_numpy()
        
        # Splitting dataset
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        
        # Plot overlapped clusters
        st.write("### Cluster Visualization")
        fig, ax = plt.subplots()
        sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, palette='coolwarm', alpha=0.6, ax=ax)
        ax.set_xlabel("Feature 1")
        ax.set_ylabel("Feature 2")
        ax.set_title("Overlapped Clusters")
        st.pyplot(fig)
        
        # Function to train SVM and get performance metrics
        def evaluate_svm(kernel_type):
            model = SVC(kernel=kernel_type)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            cm = confusion_matrix(y_test, y_pred)
            cr = classification_report(y_test, y_pred, output_dict=True)
            return model, cm, cr
        
        # Streamlit tabs
        tab1, tab2, tab3 = st.tabs(["Linear Kernel", "Polynomial Kernel", "RBF Kernel"])
        
        # Pair each tab with its scikit-learn kernel name and a display label
        for tab, kernel, label in zip([tab1, tab2, tab3], ["linear", "poly", "rbf"], ["Linear", "Polynomial", "RBF"]):
            with tab:
                st.write(f"## SVM with {label} Kernel")
                model, cm, cr = evaluate_svm(kernel)
                
                # Confusion matrix
                st.write("### Confusion Matrix")
                fig, ax = plt.subplots()
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
                ax.set_xlabel("Predicted")
                ax.set_ylabel("Actual")
                ax.set_title("Confusion Matrix")
                st.pyplot(fig)
                
                # Classification report
                st.write("### Classification Report")
                st.dataframe(pd.DataFrame(cr).transpose())
                
                # Decision boundary
                st.write("### Decision Boundary")
                visualize_classifier(model, X, y, title=f"Decision Boundary - {label} Kernel")
                
                # Explanation
                explanation = {
                    "linear": "The linear kernel performs well when the data is linearly separable.",
                    "poly": "The polynomial kernel captures more complex relationships but may overfit with high-degree polynomials.",
                    "rbf": "The RBF kernel is effective in capturing non-linear relationships in the data but requires careful tuning of parameters."
                }
                st.markdown(f"**Performance Analysis:** {explanation[kernel]}")

if __name__ == "__main__":
    main()