Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from sklearn.model_selection import train_test_split | |
from sklearn.svm import SVC | |
from sklearn.metrics import confusion_matrix, classification_report | |
# Function to visualize decision boundary
def visualize_classifier(classifier, X, y, title='', mesh_step_size=0.01):
    """Draw a fitted classifier's decision regions over 2-D data in Streamlit.

    The classifier is evaluated on a regular grid that covers the data range
    padded by one unit on every side; the predictions are drawn as a shaded
    background and the samples are scattered on top.

    Args:
        classifier: Fitted estimator exposing ``predict`` for two-column input.
        X: Array-like of shape (n_samples, 2); column 0 is plotted on the
            x-axis and column 1 on the y-axis.
        y: Array-like of class labels, one per row of ``X``. Labels may be
            numeric or non-numeric (e.g. strings).
        title: Title shown above the axes.
        mesh_step_size: Grid spacing in data units. Smaller values give a
            smoother boundary but require ``predict`` on many more points.

    Raises:
        ValueError: If ``mesh_step_size`` is not positive.
    """
    if mesh_step_size <= 0:
        raise ValueError("mesh_step_size must be positive")

    X = np.asarray(X)
    y = np.asarray(y)

    # Pad the plotted region by one unit around the data.
    min_x, max_x = X[:, 0].min() - 1.0, X[:, 0].max() + 1.0
    min_y, max_y = X[:, 1].min() - 1.0, X[:, 1].max() + 1.0

    x_vals, y_vals = np.meshgrid(
        np.arange(min_x, max_x, mesh_step_size),
        np.arange(min_y, max_y, mesh_step_size),
    )
    output = classifier.predict(np.c_[x_vals.ravel(), y_vals.ravel()])

    # Colormaps need numbers: map non-numeric labels to their sorted index so
    # string-labelled datasets can be drawn too. Numeric labels pass through.
    point_colors = y
    if not np.issubdtype(y.dtype, np.number):
        classes = np.unique(y)
        point_colors = np.searchsorted(classes, y)
        output = np.searchsorted(classes, output)
    output = output.reshape(x_vals.shape)

    fig, ax = plt.subplots()
    try:
        ax.set_title(title)
        ax.pcolormesh(x_vals, y_vals, output, cmap=plt.cm.gray, shading='auto')
        ax.scatter(X[:, 0], X[:, 1], c=point_colors, s=75, edgecolors='black',
                   linewidth=1, cmap=plt.cm.Paired)
        # Integer ticks that lie inside the padded range (ceil/floor rather
        # than int(), which truncates toward zero and can leave the range).
        ax.set_xticks(np.arange(np.ceil(min_x), np.floor(max_x) + 1.0, 1.0))
        ax.set_yticks(np.arange(np.ceil(min_y), np.floor(max_y) + 1.0, 1.0))
        # Set the limits last so that placing ticks cannot widen the view
        # beyond the evaluated mesh.
        ax.set_xlim(x_vals.min(), x_vals.max())
        ax.set_ylim(y_vals.min(), y_vals.max())
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is closed; without this a
        # new figure would accumulate on each Streamlit rerun.
        plt.close(fig)
# Location of the bundled dataset; the last column is treated as the target.
_DATASET_PATH = './data/overlapped.csv'

# Markdown shown inside the "About SVM Kernels" expander.
_ABOUT_MARKDOWN = """
# 🧠 SVM Kernel Comparison: Understanding the Impact on Overlapped Data

In machine learning, **Support Vector Machines (SVMs)** are powerful classifiers that work well for both linear and non-linear decision boundaries. However, the performance of an SVM heavily depends on the **choice of kernel function**. Let's analyze how different kernels handle **overlapped data** and why choosing the right kernel is crucial.

## 🔍 Kernel Performance Breakdown

### 1️⃣ **Linear Kernel** 🟢
- 📏 Assumes the data is **linearly separable** (i.e., can be divided by a straight line).
- ✅ Works well when classes are well-separated.
- ❌ Struggles with highly overlapped data, leading to **poor generalization**.
- 📌 **Best for:** High-dimensional sparse data (e.g., text classification).

### 2️⃣ **Polynomial Kernel** 🟠
- 📈 Expands feature space by computing polynomial combinations of features.
- ✅ Can model more complex decision boundaries.
- ❌ **High-degree polynomials** can lead to **overfitting**.
- 📌 **Best for:** Medium-complexity patterns where interactions between features matter.

### 3️⃣ **Radial Basis Function (RBF) Kernel** 🔵
- 🔥 Uses **Gaussian similarity** to map data into a higher-dimensional space.
- ✅ Excels in handling **highly non-linear** and **overlapped** data.
- ❌ Requires careful tuning of the **gamma** parameter to avoid underfitting or overfitting.
- 📌 **Best for:** Complex, non-linear relationships (e.g., image classification).

## 🎯 Choosing the Right Kernel
- If data is **linearly separable**, a **linear kernel** is efficient and interpretable.
- If data has **moderate overlap**, a **polynomial kernel** provides flexibility.
- If data is **highly overlapped and non-linear**, the **RBF kernel** is often the best choice.

### 🤖 Key Takeaway
The **right kernel choice** significantly impacts classification accuracy. While RBF is a strong default for **complex overlapped data**, simpler kernels should be preferred when appropriate to reduce computation cost and improve interpretability. **Experimentation and hyperparameter tuning are essential** to achieving the best results.

🚀 *“There is no one-size-fits-all kernel — understanding your data is the key to unlocking SVM’s full potential!”*

📝 Created by: Louie F.Cervantes, M.Eng. (Information Engineering)
"""

# One-line commentary shown under each kernel's results, keyed by SVC kernel name.
_KERNEL_NOTES = {
    "linear": "The linear kernel performs well when the data is linearly separable.",
    "poly": "The polynomial kernel captures more complex relationships but may overfit with high-degree polynomials.",
    "rbf": "The RBF kernel is effective in capturing non-linear relationships in the data but requires careful tuning of parameters.",
}


def _show_figure(fig):
    """Render *fig* in Streamlit, then close it so pyplot does not retain it."""
    try:
        st.pyplot(fig)
    finally:
        plt.close(fig)


def _load_dataset(path):
    """Read the CSV at *path*; on failure show an error in the UI and return None."""
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        st.error(f"Dataset not found: {path}")
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not read dataset {path}: {exc}")
    return None


def _evaluate_svm(kernel_type, X_train, X_test, y_train, y_test):
    """Fit an SVC with *kernel_type*; return (model, confusion matrix, report dict)."""
    model = SVC(kernel=kernel_type)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred, output_dict=True)
    return model, cm, cr


def main():
    """Render the SVM kernel comparison page.

    Shows the explanatory text, loads the bundled CSV, previews it, plots the
    first two features coloured by class, and then, in one tab per kernel
    (linear, poly, rbf), trains an SVC on a 70/30 split and displays its
    confusion matrix, classification report and decision boundary.
    """
    st.title("SVM Kernel Performance Comparison")
    with st.expander("About SVM Kernels"):
        st.markdown(_ABOUT_MARKDOWN)

    # Load the dataset
    df = _load_dataset(_DATASET_PATH)
    if df is None:
        return

    st.write("### Data Preview")
    st.dataframe(df)

    # The plots below need two feature columns plus the target column.
    if df.empty or df.shape[1] < 3:
        st.error("The dataset must contain at least two feature columns followed by a target column.")
        return

    # Assuming the last column is the target
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Train on plain arrays: the decision-boundary mesh is a bare ndarray, and
    # fitting on a DataFrame would make every mesh prediction emit sklearn's
    # "X does not have valid feature names" warning.
    X_values = X.to_numpy()
    y_values = y.to_numpy()

    # Splitting dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X_values, y_values, test_size=0.3, random_state=42
    )

    # Plot overlapped clusters
    st.write("### Cluster Visualization")
    fig, ax = plt.subplots()
    sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=y, palette='coolwarm', alpha=0.6, ax=ax)
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    ax.set_title("Overlapped Clusters")
    _show_figure(fig)

    # Streamlit tabs
    tab1, tab2, tab3 = st.tabs(["Linear Kernel", "Polynomial Kernel", "RBF Kernel"])
    for tab, kernel in zip([tab1, tab2, tab3], ["linear", "poly", "rbf"]):
        with tab:
            st.write(f"## SVM with {kernel.capitalize()} Kernel")
            model, cm, cr = _evaluate_svm(kernel, X_train, X_test, y_train, y_test)

            # Confusion matrix
            st.write("### Confusion Matrix")
            fig, ax = plt.subplots()
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title("Confusion Matrix")
            _show_figure(fig)

            # Classification report
            st.write("### Classification Report")
            st.dataframe(pd.DataFrame(cr).transpose())

            # Decision boundary
            st.write("### Decision Boundary")
            if X_values.shape[1] == 2:
                visualize_classifier(model, X_values, y_values, title=f"Decision Boundary - {kernel.capitalize()} Kernel")
            else:
                # The mesh is two-dimensional, so a model trained on any other
                # number of features cannot be evaluated on it.
                st.info("The decision boundary is only drawn for datasets with exactly two features.")

            # Explanation
            st.markdown(f"**Performance Analysis:** {_KERNEL_NOTES[kernel]}")
# Script entry point: build the page only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()