import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

# Function to visualize the decision boundary of a trained classifier
def visualize_classifier(classifier, X, y, title=''):
    """Plot the decision regions of `classifier` over a 2-D feature space."""
    # Pad the plotting range by one unit around the data
    min_x, max_x = X[:, 0].min() - 1.0, X[:, 0].max() + 1.0
    min_y, max_y = X[:, 1].min() - 1.0, X[:, 1].max() + 1.0
    # Evaluate the classifier on a dense grid to recover the decision regions
    mesh_step_size = 0.01
    x_vals, y_vals = np.meshgrid(np.arange(min_x, max_x, mesh_step_size), 
                                  np.arange(min_y, max_y, mesh_step_size))
    output = classifier.predict(np.c_[x_vals.ravel(), y_vals.ravel()])
    output = output.reshape(x_vals.shape)
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.pcolormesh(x_vals, y_vals, output, cmap=plt.cm.gray, shading='auto')
    ax.scatter(X[:, 0], X[:, 1], c=y, s=75, edgecolors='black', linewidth=1, cmap=plt.cm.Paired)
    ax.set_xlim(x_vals.min(), x_vals.max())
    ax.set_ylim(y_vals.min(), y_vals.max())
    ax.set_xticks(np.arange(int(X[:, 0].min() - 1), int(X[:, 0].max() + 1), 1.0))
    ax.set_yticks(np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0))
    st.pyplot(fig)

def main():
    st.title("SVM Kernel Performance Comparison")

    about = """
    # 🧠 SVM Kernel Comparison: Understanding the Impact on Overlapped Data  

    In machine learning, **Support Vector Machines (SVMs)** are powerful classifiers that work well for both linear and non-linear decision boundaries. However, the performance of an SVM heavily depends on the **choice of kernel function**. Let's analyze how different kernels handle **overlapped data** and why choosing the right kernel is crucial.  

    ## πŸ” Kernel Performance Breakdown  

    ### 1️⃣ **Linear Kernel** 🟢  
    - 📏 Assumes the data is **linearly separable** (i.e., can be divided by a straight line).  
    - ✅ Works well when classes are well-separated.  
    - ❌ Struggles with highly overlapped data, leading to **poor generalization**.  
    - 🚀 **Best for:** High-dimensional sparse data (e.g., text classification).  
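
    A minimal sketch of fitting this kernel (illustrative, not the app's code; `X_train` and `y_train` stand for an already-split 2-D dataset):

    ```python
    from sklearn.svm import SVC

    # C trades off margin width against misclassified training points
    linear_clf = SVC(kernel='linear', C=1.0)
    linear_clf.fit(X_train, y_train)
    ```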

    ### 2️⃣ **Polynomial Kernel** 📈  
    - 🔄 Expands feature space by computing polynomial combinations of features.  
    - ✅ Can model more complex decision boundaries.  
    - ❌ **High-degree polynomials** can lead to **overfitting**.  
    - 🚀 **Best for:** Medium-complexity patterns where interactions between features matter.  
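
    A similar sketch for this kernel; `degree=3` is scikit-learn's default, and the `coef0` value is illustrative (it controls how much low-degree terms contribute):

    ```python
    # Higher degree = more flexible boundary, but greater overfitting risk
    poly_clf = SVC(kernel='poly', degree=3, coef0=1.0)
    poly_clf.fit(X_train, y_train)
    ```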

    ### 3️⃣ **Radial Basis Function (RBF) Kernel** 🔵  
    - 🔥 Uses **Gaussian similarity** to map data into a higher-dimensional space.  
    - ✅ Excels in handling **highly non-linear** and **overlapped** data.  
    - ❌ Requires careful tuning of the **gamma** parameter to avoid underfitting or overfitting.  
    - 🚀 **Best for:** Complex, non-linear relationships (e.g., image classification).  
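
    And for this kernel, where `gamma` sets the width of the Gaussian similarity; `'scale'` is scikit-learn's default:

    ```python
    # gamma='scale' means 1 / (n_features * X.var()), a sensible starting point
    rbf_clf = SVC(kernel='rbf', gamma='scale')
    rbf_clf.fit(X_train, y_train)
    ```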

    ## 🎯 Choosing the Right Kernel  
    - If data is **linearly separable**, a **linear kernel** is efficient and interpretable.  
    - If data has **moderate overlap**, a **polynomial kernel** provides flexibility.  
    - If data is **highly overlapped and non-linear**, the **RBF kernel** is often the best choice.  

    ### 🤖 Key Takeaway  
    The **right kernel choice** significantly impacts classification accuracy. While RBF is a strong default for **complex overlapped data**, simpler kernels should be preferred when appropriate to reduce computation cost and improve interpretability. **Experimentation and hyperparameter tuning are essential** to achieving the best results.  
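
    A sketch of such tuning with cross-validated grid search (grid values are illustrative, not prescriptive; `X_train`/`y_train` as above):

    ```python
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    # Search over kernel, regularization strength, and kernel width
    param_grid = {
        "kernel": ["linear", "poly", "rbf"],
        "C": [0.1, 1, 10],
        "gamma": ["scale", 0.1, 1.0],
    }
    search = GridSearchCV(SVC(), param_grid, cv=5)
    search.fit(X_train, y_train)
    print(search.best_params_, search.best_score_)
    ```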

    🔎 *"There is no one-size-fits-all kernel – understanding your data is the key to unlocking SVM's full potential!"* 
    
    🚀 Created by: Louie F. Cervantes, M.Eng. (Information Engineering)
    """
    with st.expander("About SVM Kernels"):
        st.markdown(about)

    # Load the bundled dataset from a fixed path (no upload widget is used)
    data_file = './data/overlapped.csv'
    if data_file:
        df = pd.read_csv(data_file)
        st.write("### Data Preview")
        st.dataframe(df)
        
        # Assume the last column is the target; convert to NumPy arrays so the
        # classifier sees the same input type during training and visualization
        X = df.iloc[:, :-1].to_numpy()
        y = df.iloc[:, -1].to_numpy()
        
        # Splitting dataset
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        
        # Plot overlapped clusters
        st.write("### Cluster Visualization")
        fig, ax = plt.subplots()
        sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, palette='coolwarm', alpha=0.6, ax=ax)
        ax.set_xlabel("Feature 1")
        ax.set_ylabel("Feature 2")
        ax.set_title("Overlapped Clusters")
        st.pyplot(fig)
        
        # Function to train SVM and get performance metrics
        def evaluate_svm(kernel_type):
            model = SVC(kernel=kernel_type)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            cm = confusion_matrix(y_test, y_pred)
            cr = classification_report(y_test, y_pred, output_dict=True)
            return model, cm, cr
        
        # Streamlit tabs
        tab1, tab2, tab3 = st.tabs(["Linear Kernel", "Polynomial Kernel", "RBF Kernel"])
        
        # Pair each tab with its scikit-learn kernel name and a display label
        for tab, kernel, label in zip([tab1, tab2, tab3], ["linear", "poly", "rbf"], ["Linear", "Polynomial", "RBF"]):
            with tab:
                st.write(f"## SVM with {label} Kernel")
                model, cm, cr = evaluate_svm(kernel)
                
                # Confusion matrix
                st.write("### Confusion Matrix")
                fig, ax = plt.subplots()
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
                ax.set_xlabel("Predicted")
                ax.set_ylabel("Actual")
                ax.set_title("Confusion Matrix")
                st.pyplot(fig)
                
                # Classification report
                st.write("### Classification Report")
                st.dataframe(pd.DataFrame(cr).transpose())
                
                # Decision boundary
                st.write("### Decision Boundary")
                visualize_classifier(model, X, y, title=f"Decision Boundary - {label} Kernel")
                
                # Explanation
                explanation = {
                    "linear": "The linear kernel performs well when the data is linearly separable.",
                    "poly": "The polynomial kernel captures more complex relationships but may overfit with high-degree polynomials.",
                    "rbf": "The RBF kernel is effective in capturing non-linear relationships in the data but requires careful tuning of parameters."
                }
                st.markdown(f"**Performance Analysis:** {explanation[kernel]}")

if __name__ == "__main__":
    main()