Spaces:

louiecerv
/

cyberattack_detection_ml_approach

Running

App Files Files Community

cyberattack_detection_ml_approach / app.py

louiecerv

updated the readme file

2f7249c 4 months ago

raw

history blame contribute delete

7.08 kB

	import os
	import streamlit as st
	from datasets import load_dataset
	import pandas as pd
	from huggingface_hub import login
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import GaussianNB
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.metrics import confusion_matrix, classification_report
	from sklearn.preprocessing import LabelEncoder
	import seaborn as sns
	import matplotlib.pyplot as plt
	import numpy as np
	from PIL import Image

	# Hub identifier of the dataset this app loads and analyzes.
	dataset_name = "louiecerv/unsw-nb15-preprocessed"

	# The Hugging Face access token is read from the environment; without it
	# the app reports the problem on the page and halts the script run.
	hf_token = os.environ.get("HF_TOKEN")
	if not hf_token:
	    st.error(
	        "HF_TOKEN environment variable is not set. Please set it before running the app."
	    )
	    st.stop()

	# Authenticate against the Hugging Face Hub before any dataset access.
	login(token=hf_token)

	# Page heading and short description.
	st.title("🛡️📈 Cyber Attack Detection ML Approach")
	st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")

	# Banner image, opened relative to the current working directory.
	image = Image.open("cyber_attack.jpg")
	st.image(image, use_container_width=True, caption="Cybersecurity")

	# Markdown text shown inside the collapsible "About this App" section.
	about = """
	# About This App

	This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark for evaluating network intrusion detection systems. The app leverages machine learning techniques to classify network traffic as either normal or indicative of various attack types.

	## About the UNSW-NB15 Dataset

	The UNSW-NB15 dataset was created by the Cyber Security Lab at the University of New South Wales, Canberra. It's a comprehensive dataset containing network traffic captures (tcpdump) and system call traces. The dataset includes a variety of modern attack types, making it a valuable resource for training and testing intrusion detection systems. Key features of the dataset include:

	* Diverse Attack Types: Covers a wide range of attacks such as Fuzzers, Backdoor, DoS, Exploits, Generic, Reconnaissance, Shellcode, and Worms.
	* Realistic Network Traffic: Generated using a realistic network environment, simulating real-world scenarios.
	* Labeled Data: Each network flow is labeled with its corresponding attack type or as normal traffic, enabling supervised learning.

	## App Purpose

	This app aims to:

	1. Visualize and Explore the Data: Provide an interface to view the dataset's structure, data types, and descriptive statistics. This allows users to understand the characteristics of the UNSW-NB15 dataset.

	2. Train and Evaluate Machine Learning Models: Implement and compare the performance of several machine learning classifiers, specifically:
	* Naive Bayes
	* Decision Tree
	* K-Nearest Neighbors

	3. Analyze Model Performance: Present confusion matrices and classification reports to evaluate the effectiveness of each model in detecting different attack types. This helps users understand the strengths and weaknesses of each algorithm.

	4. Facilitate Learning: Serve as an educational tool for learning about network intrusion detection, machine learning classification, and dataset analysis.
	"""

	# Create the expander first, then render the text inside it.
	about_section = st.expander("About this App")
	with about_section:
	    st.markdown(about)

	# Fetch the dataset from the Hub. Each handler only records a message;
	# reporting it and stopping the script happens once, below.
	load_error = None
	try:
	    with st.spinner("Loading dataset..."):
	        dataset = load_dataset(dataset_name)
	        st.success("Dataset loaded successfully.")
	except ValueError:
	    load_error = "Dataset not found or incorrect dataset name. Please check the dataset identifier."
	except PermissionError:
	    load_error = "Authentication failed. Check if your Hugging Face token is correct."
	except Exception as e:
	    load_error = f"Unexpected error: {e}"

	if load_error is not None:
	    st.error(load_error)
	    st.stop()

	# Materialize the train split as a DataFrame and preview it on the page.
	df = dataset["train"].to_pandas()
	st.write("### Train Split")
	st.write(f"Shape: {df.shape}")
	st.dataframe(df.head())

	# Normalize column types: object columns become strings, float64/int64
	# columns are passed through to_numeric with unparseable values coerced
	# to NaN.
	for col in df.columns:
	    col_dtype = df[col].dtype
	    if col_dtype == 'object':
	        df[col] = df[col].astype(str)
	    elif col_dtype in ['float64', 'int64']:
	        df[col] = pd.to_numeric(df[col], errors='coerce')

	# Turn +/-inf into NaN, then replace every NaN with 0.
	df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

	# Store the cleaned frame in the session only if no frame is stored yet.
	if "df" not in st.session_state:
	    st.session_state["df"] = df

	def run_model(model, model_name):
	    """Train ``model`` on the cached dataset and render its evaluation.

	    Reads the DataFrame stored in ``st.session_state.df``, predicts the
	    ``attack_cat`` column from every remaining column except ``label``,
	    and writes a confusion matrix and a classification report to the
	    current Streamlit container.

	    Args:
	        model: Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``.
	        model_name: Display name used in the spinner text and the header.

	    Raises:
	        KeyError: If the DataFrame lacks a ``label`` or ``attack_cat`` column.
	    """
	    with st.spinner(f"Training {model_name}..."):
	        df = st.session_state.df
	        st.header(f"{model_name} Classifier")

	        # Both target columns must be removed in a single drop. Two separate
	        # `df.drop(...)` assignments would let the second overwrite the
	        # first, leaving 'label' (the other target) in the feature matrix.
	        X = df.drop(columns=['label', 'attack_cat'])  # Features
	        y = df['attack_cat']  # Target

	        # Encode the class names as integers for training.
	        le = LabelEncoder()
	        y_encoded = le.fit_transform(y)

	        # Console output of the encoding, kept for verification.
	        print("Original labels:", y.values)
	        print("Encoded labels:", y_encoded)
	        print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

	        # Fixed 70/30 split so repeated runs evaluate on the same rows.
	        X_train, X_test, y_train, y_test = train_test_split(
	            X, y_encoded, test_size=0.3, random_state=42
	        )

	        model.fit(X_train, y_train)
	        y_pred = model.predict(X_test)

	        # Map the integer predictions back to class names for reporting.
	        y_pred_original = le.inverse_transform(y_pred)
	        y_test_original = le.inverse_transform(y_test)

	        # Confusion matrix. Passing the full class list fixes the row/column
	        # order and lets the heatmap show class names instead of indices.
	        st.write("## Confusion Matrix")
	        class_names = list(le.classes_)
	        cm = confusion_matrix(y_test_original, y_pred_original, labels=class_names)
	        fig, ax = plt.subplots(figsize=(10, 8))
	        sns.heatmap(
	            cm,
	            annot=True,
	            fmt='d',
	            cmap='Blues',
	            xticklabels=class_names,
	            yticklabels=class_names,
	            ax=ax,
	        )
	        ax.set_xlabel('Predicted')
	        ax.set_ylabel('True')
	        st.pyplot(fig)
	        # pyplot keeps a reference to every open figure; close this one so
	        # figures do not accumulate across script reruns.
	        plt.close(fig)

	        # Per-class precision/recall/F1 rendered as a table.
	        st.write("## Classification Report")
	        report = classification_report(
	            y_test_original, y_pred_original, output_dict=True, zero_division=1
	        )
	        report_df = pd.DataFrame(report).transpose()
	        st.table(report_df)

	def main():
	    """Render the tabbed UI: a dataset overview plus one tab per classifier."""
	    df = st.session_state.df

	    # The first tab describes the data; the remaining ones each host a model.
	    overview_tab, *classifier_tabs = st.tabs(
	        ["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"]
	    )

	    with overview_tab:
	        st.header("Dataset Information")
	        st.write("## Data Types")
	        st.write(df.dtypes)
	        st.write("## Statistical Overview")
	        st.write(df.describe())

	    # Estimator classes paired with their display names, in tab order.
	    classifiers = (
	        (GaussianNB, "Naive Bayes"),
	        (DecisionTreeClassifier, "Decision Tree"),
	        (KNeighborsClassifier, "K-Nearest Neighbor"),
	    )
	    for tab, (estimator_cls, title) in zip(classifier_tabs, classifiers):
	        with tab:
	            # A fresh estimator with default settings is built inside its tab.
	            run_model(estimator_cls(), title)

	# Build the tabbed interface only when this file is executed as the main
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()