import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
# Function to visualize decision boundary
def visualize_classifier(classifier, X, y, title=''):
    # Define the plotting range with a 1-unit margin around the data
    min_x, max_x = X[:, 0].min() - 1.0, X[:, 0].max() + 1.0
    min_y, max_y = X[:, 1].min() - 1.0, X[:, 1].max() + 1.0

    # Evaluate the classifier on a dense mesh over the feature space
    mesh_step_size = 0.01
    x_vals, y_vals = np.meshgrid(np.arange(min_x, max_x, mesh_step_size),
                                 np.arange(min_y, max_y, mesh_step_size))
    output = classifier.predict(np.c_[x_vals.ravel(), y_vals.ravel()])
    output = output.reshape(x_vals.shape)

    # Shade the predicted regions and overlay the actual points
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.pcolormesh(x_vals, y_vals, output, cmap=plt.cm.gray, shading='auto')
    ax.scatter(X[:, 0], X[:, 1], c=y, s=75, edgecolors='black', linewidth=1,
               cmap=plt.cm.Paired)
    ax.set_xlim(x_vals.min(), x_vals.max())
    ax.set_ylim(y_vals.min(), y_vals.max())
    ax.set_xticks(np.arange(int(X[:, 0].min() - 1), int(X[:, 0].max() + 1), 1.0))
    ax.set_yticks(np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0))
    st.pyplot(fig)
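
# Example usage (illustrative; assumes `clf` is a classifier already fitted
# on two features):
#   visualize_classifier(clf, X.to_numpy(), y.to_numpy(), title="RBF Kernel")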

def main():
    st.title("SVM Kernel Performance Comparison")

    about = """
# 🧠 SVM Kernel Comparison: Understanding the Impact on Overlapped Data
In machine learning, **Support Vector Machines (SVMs)** are powerful classifiers that work well for both linear and non-linear decision boundaries. However, the performance of an SVM heavily depends on the **choice of kernel function**. Let's analyze how different kernels handle **overlapped data** and why choosing the right kernel is crucial.
## 🔍 Kernel Performance Breakdown
### 1️⃣ **Linear Kernel** 🟢
- 📏 Assumes the data is **linearly separable** (i.e., can be divided by a straight line).
- ✅ Works well when classes are well-separated (a minimal sketch follows this list).
- ❌ Struggles with highly overlapped data, leading to **poor generalization**.
- 🚀 **Best for:** High-dimensional sparse data (e.g., text classification).
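A minimal sketch of the well-separated case (synthetic data, purely for illustration):

```python
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

# Two well-separated blobs: the case a linear kernel handles cleanly
X, y = make_blobs(n_samples=200, centers=2, cluster_std=1.0, random_state=0)
clf = SVC(kernel='linear').fit(X, y)
print(clf.score(X, y))  # close to 1.0 when the classes barely overlap
```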
### 2️⃣ **Polynomial Kernel** 📈
- 🔄 Expands the feature space by computing polynomial combinations of features.
- ✅ Can model more complex decision boundaries.
- ❌ **High-degree polynomials** can lead to **overfitting** (see the degree sweep after this list).
- 🚀 **Best for:** Medium-complexity patterns where interactions between features matter.
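A sketch of how the `degree` parameter trades flexibility against overfitting (the values are illustrative, not recommendations):

```python
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Noisy half-moons: a non-linear problem with some class overlap
X, y = make_moons(n_samples=300, noise=0.3, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
for degree in (2, 3, 9):
    clf = SVC(kernel='poly', degree=degree, coef0=1).fit(X_tr, y_tr)
    print(degree, clf.score(X_te, y_te))  # high degrees tend to overfit
```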
### 3️⃣ **Radial Basis Function (RBF) Kernel** 🔵
- 🔥 Uses **Gaussian similarity** to map data into a higher-dimensional space.
- ✅ Excels at handling **highly non-linear** and **overlapped** data.
- ❌ Requires careful tuning of the **gamma** parameter to avoid underfitting or overfitting (a tuning sketch follows this list).
- 🚀 **Best for:** Complex, non-linear relationships (e.g., image classification).
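A minimal gamma-tuning sketch using `GridSearchCV` (the grid values are assumptions for illustration):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# flip_y injects label noise so the two classes overlap
X, y = make_classification(n_samples=300, n_features=2, n_informative=2,
                           n_redundant=0, flip_y=0.2, random_state=0)
search = GridSearchCV(SVC(kernel='rbf'),
                      param_grid={'gamma': [0.01, 0.1, 1, 10], 'C': [0.1, 1, 10]},
                      cv=5)
search.fit(X, y)
print(search.best_params_, search.best_score_)
```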
## 🎯 Choosing the Right Kernel
- If data is **linearly separable**, a **linear kernel** is efficient and interpretable.
- If data has **moderate overlap**, a **polynomial kernel** provides flexibility.
- If data is **highly overlapped and non-linear**, the **RBF kernel** is often the best choice (the snippet below compares all three).
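The rules of thumb above, as a side-by-side sketch on synthetic overlapped data (scores will vary with the data; this is not a benchmark):

```python
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# A large cluster_std makes the two classes overlap heavily
X, y = make_blobs(n_samples=400, centers=2, cluster_std=3.0, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=1)
for kernel in ('linear', 'poly', 'rbf'):
    acc = SVC(kernel=kernel).fit(X_tr, y_tr).score(X_te, y_te)
    print(f"{kernel}: {acc:.3f}")
```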
### 🤖 Key Takeaway
The **right kernel choice** significantly impacts classification accuracy. While RBF is a strong default for **complex overlapped data**, simpler kernels should be preferred when appropriate to reduce computation cost and improve interpretability. **Experimentation and hyperparameter tuning are essential** to achieving the best results.
🔎 *“There is no one-size-fits-all kernel – understanding your data is the key to unlocking SVM’s full potential!”*
🚀 Created by: Louie F. Cervantes, M.Eng. (Information Engineering)
"""
with st.expander("About SVM Kernels"):
st.markdown(about)
uploaded_file = './data/overlapped.csv'
if uploaded_file:
df = pd.read_csv(uploaded_file)
st.write("### Data Preview")
st.dataframe(df)
# Assuming the last column is the target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Plot overlapped clusters
st.write("### Cluster Visualization")
fig, ax = plt.subplots()
scatter = sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=y, palette='coolwarm', alpha=0.6)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("Overlapped Clusters")
st.pyplot(fig)
    # Function to train SVM and get performance metrics
    def evaluate_svm(kernel_type):
        model = SVC(kernel=kernel_type)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cr = classification_report(y_test, y_pred, output_dict=True)
        return model, cm, cr

    # Streamlit tabs, one per kernel
    tab1, tab2, tab3 = st.tabs(["Linear Kernel", "Polynomial Kernel", "RBF Kernel"])
    for tab, kernel in zip([tab1, tab2, tab3], ["linear", "poly", "rbf"]):
        with tab:
            st.write(f"## SVM with {kernel.capitalize()} Kernel")
            model, cm, cr = evaluate_svm(kernel)

            # Confusion matrix
            st.write("### Confusion Matrix")
            fig, ax = plt.subplots()
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title("Confusion Matrix")
            st.pyplot(fig)

            # Classification report
            st.write("### Classification Report")
            st.dataframe(pd.DataFrame(cr).transpose())

            # Decision boundary
            st.write("### Decision Boundary")
            visualize_classifier(model, X.to_numpy(), y.to_numpy(),
                                 title=f"Decision Boundary - {kernel.capitalize()} Kernel")

            # Explanation of each kernel's behavior
            explanation = {
                "linear": "The linear kernel performs well when the data is linearly separable.",
                "poly": "The polynomial kernel captures more complex relationships but may overfit with high-degree polynomials.",
                "rbf": "The RBF kernel is effective in capturing non-linear relationships in the data but requires careful tuning of parameters."
            }
            st.markdown(f"**Performance Analysis:** {explanation[kernel]}")


if __name__ == "__main__":
    main()