import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
# Function to visualize the decision boundary of a trained classifier
def visualize_classifier(classifier, X, y, title=''):
    # Define the plotting range with a 1-unit margin around the data
    min_x, max_x = X[:, 0].min() - 1.0, X[:, 0].max() + 1.0
    min_y, max_y = X[:, 1].min() - 1.0, X[:, 1].max() + 1.0

    # Build a dense grid of points and classify each one
    mesh_step_size = 0.01
    x_vals, y_vals = np.meshgrid(np.arange(min_x, max_x, mesh_step_size),
                                 np.arange(min_y, max_y, mesh_step_size))
    output = classifier.predict(np.c_[x_vals.ravel(), y_vals.ravel()])
    output = output.reshape(x_vals.shape)

    # Shade the predicted regions and overlay the actual data points
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.pcolormesh(x_vals, y_vals, output, cmap=plt.cm.gray, shading='auto')
    ax.scatter(X[:, 0], X[:, 1], c=y, s=75, edgecolors='black', linewidth=1,
               cmap=plt.cm.Paired)
    ax.set_xlim(x_vals.min(), x_vals.max())
    ax.set_ylim(y_vals.min(), y_vals.max())
    ax.set_xticks(np.arange(int(X[:, 0].min() - 1), int(X[:, 0].max() + 1), 1.0))
    ax.set_yticks(np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0))
    st.pyplot(fig)
def main():
    st.title("SVM Kernel Performance Comparison")

    about = """
# SVM Kernel Comparison: Understanding the Impact on Overlapped Data

In machine learning, **Support Vector Machines (SVMs)** are powerful classifiers that work well for both linear and non-linear decision boundaries. However, the performance of an SVM depends heavily on the **choice of kernel function**. Let's analyze how different kernels handle **overlapped data** and why choosing the right kernel is crucial.

## Kernel Performance Breakdown
### 1️⃣ **Linear Kernel**
- Assumes the data is **linearly separable** (i.e., can be divided by a straight line).
- ✅ Works well when classes are well-separated.
- ❌ Struggles with highly overlapped data, leading to **poor generalization**.
- **Best for:** high-dimensional sparse data (e.g., text classification); see the sketch below.
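
A minimal sketch of the linear kernel on synthetic, well-separated data (the dataset and parameter values are illustrative, not part of this app):

```python
# Linear-kernel SVM on synthetic, (mostly) linearly separable blobs.
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

X, y = make_blobs(n_samples=200, centers=2, random_state=42)  # toy data
clf = SVC(kernel="linear", C=1.0)  # C trades margin width against training errors
clf.fit(X, y)
print(clf.score(X, y))  # training accuracy; near 1.0 when classes barely overlap
```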
### 2️⃣ **Polynomial Kernel**
- Expands the feature space by computing polynomial combinations of features.
- ✅ Can model more complex decision boundaries.
- ❌ **High-degree polynomials** can lead to **overfitting**.
- **Best for:** medium-complexity patterns where interactions between features matter; see the sketch below.
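
A minimal sketch of how the `degree` parameter shapes the polynomial kernel's boundary (synthetic data; settings are illustrative):

```python
# Polynomial-kernel SVM: higher degree = more complex boundary, more overfitting risk.
from sklearn.datasets import make_moons
from sklearn.svm import SVC

X, y = make_moons(n_samples=200, noise=0.2, random_state=42)  # non-linear toy data
for degree in (2, 3, 5):
    clf = SVC(kernel="poly", degree=degree, coef0=1.0).fit(X, y)
    print(degree, clf.score(X, y))  # training accuracy tends to rise with degree
```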
### 3️⃣ **Radial Basis Function (RBF) Kernel**
- Uses **Gaussian similarity** to map data into a higher-dimensional space.
- ✅ Excels at handling **highly non-linear** and **overlapped** data.
- ❌ Requires careful tuning of the **gamma** parameter to avoid underfitting or overfitting.
- **Best for:** complex, non-linear relationships (e.g., image classification); see the sketch below.
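
A minimal sketch of how `gamma` controls the RBF kernel's fit (synthetic data; values are illustrative):

```python
# RBF-kernel SVM: gamma sets how far a single sample's influence reaches.
from sklearn.datasets import make_circles
from sklearn.svm import SVC

X, y = make_circles(n_samples=200, noise=0.1, factor=0.4, random_state=42)
for gamma in (0.01, 1.0, 100.0):
    clf = SVC(kernel="rbf", gamma=gamma).fit(X, y)
    # Small gamma -> overly smooth boundary (underfit);
    # large gamma -> boundary hugs individual points (overfit).
    print(gamma, clf.score(X, y))
```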
## Choosing the Right Kernel
- If the data is **linearly separable**, a **linear kernel** is efficient and interpretable.
- If the data has **moderate overlap**, a **polynomial kernel** provides flexibility.
- If the data is **highly overlapped and non-linear**, the **RBF kernel** is often the best choice.
### Key Takeaway
The **right kernel choice** significantly impacts classification accuracy. While RBF is a strong default for **complex overlapped data**, simpler kernels should be preferred when they suffice, to reduce computation cost and improve interpretability. **Experimentation and hyperparameter tuning are essential** to achieving the best results; one way to run that experiment is sketched below.
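
A cross-validated grid search is a common approach (synthetic data; the parameter grid is an illustrative assumption, not a recipe):

```python
# Cross-validated search over kernels and their key hyperparameters.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Synthetic overlapped data stands in for a real dataset.
X, y = make_classification(n_samples=300, n_features=2, n_informative=2,
                           n_redundant=0, class_sep=0.8, random_state=42)

param_grid = [
    {"kernel": ["linear"], "C": [0.1, 1, 10]},
    {"kernel": ["poly"],   "degree": [2, 3], "C": [0.1, 1, 10]},
    {"kernel": ["rbf"],    "gamma": ["scale", 0.1, 1], "C": [0.1, 1, 10]},
]
search = GridSearchCV(SVC(), param_grid, cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)
```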
*"There is no one-size-fits-all kernel; understanding your data is the key to unlocking SVM's full potential!"*

Created by: Louie F. Cervantes, M.Eng. (Information Engineering)
"""
    with st.expander("About SVM Kernels"):
        st.markdown(about)

    # Load the dataset from a fixed path (a file uploader could be swapped in here)
    csv_path = './data/overlapped.csv'
    df = pd.read_csv(csv_path)

    st.write("### Data Preview")
    st.dataframe(df)

    # The last column is assumed to be the target
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Plot the overlapped clusters
    st.write("### Cluster Visualization")
    fig, ax = plt.subplots()
    sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=y, palette='coolwarm', alpha=0.6, ax=ax)
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    ax.set_title("Overlapped Clusters")
    st.pyplot(fig)
    # Train an SVM with the given kernel and return the fitted model and metrics
    def evaluate_svm(kernel_type):
        model = SVC(kernel=kernel_type)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cr = classification_report(y_test, y_pred, output_dict=True)
        return model, cm, cr

    # Short performance note shown under each tab's results
    explanation = {
        "linear": "The linear kernel performs well when the data is linearly separable.",
        "poly": "The polynomial kernel captures more complex relationships but may overfit with high-degree polynomials.",
        "rbf": "The RBF kernel is effective at capturing non-linear relationships in the data but requires careful tuning of its parameters."
    }

    # One tab per kernel
    tab1, tab2, tab3 = st.tabs(["Linear Kernel", "Polynomial Kernel", "RBF Kernel"])
    for tab, kernel in zip([tab1, tab2, tab3], ["linear", "poly", "rbf"]):
        with tab:
            st.write(f"## SVM with {kernel.capitalize()} Kernel")
            model, cm, cr = evaluate_svm(kernel)

            # Confusion matrix
            st.write("### Confusion Matrix")
            fig, ax = plt.subplots()
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title("Confusion Matrix")
            st.pyplot(fig)

            # Classification report
            st.write("### Classification Report")
            st.dataframe(pd.DataFrame(cr).transpose())

            # Decision boundary
            st.write("### Decision Boundary")
            visualize_classifier(model, X.to_numpy(), y.to_numpy(),
                                 title=f"Decision Boundary - {kernel.capitalize()} Kernel")

            # Performance analysis
            st.markdown(f"**Performance Analysis:** {explanation[kernel]}")


if __name__ == "__main__":
    main()