import os

import streamlit as st
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# Hugging Face dataset identifier for the preprocessed UNSW-NB15 data
dataset_name = "louiecerv/unsw-nb15-preprocessed"

# A Hugging Face access token is expected in the HF_TOKEN environment variable
hf_token = os.getenv("HF_TOKEN")

if not hf_token:
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()

# Authenticate with the Hugging Face Hub before downloading the dataset
login(token=hf_token)

st.title("🛡️📈 Cyber Attack Detection ML Approach")
st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")

# Banner image; cyber_attack.jpg is expected in the app's working directory
image = Image.open("cyber_attack.jpg")
st.image(image, caption="Cybersecurity", use_container_width=True)

about = """
# About This App

This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark for evaluating network intrusion detection systems. The app leverages machine learning techniques to classify network traffic as either normal or indicative of various attack types.

## About the UNSW-NB15 Dataset

The UNSW-NB15 dataset was created by the Cyber Security Lab at the University of New South Wales, Canberra. It is a comprehensive dataset containing network traffic captures (tcpdump) and system call traces. The dataset includes a variety of modern attack types, making it a valuable resource for training and testing intrusion detection systems. Key features of the dataset include:

* **Diverse Attack Types:** Covers a wide range of attacks such as Fuzzers, Backdoor, DoS, Exploits, Generic, Reconnaissance, Shellcode, and Worms.
* **Realistic Network Traffic:** Generated in a realistic network environment that simulates real-world scenarios.
* **Labeled Data:** Each network flow is labeled with its corresponding attack type or as normal traffic, enabling supervised learning.

## App Purpose

This app aims to:

1. **Visualize and Explore the Data:** Provide an interface to view the dataset's structure, data types, and descriptive statistics, so users can understand the characteristics of the UNSW-NB15 dataset.

2. **Train and Evaluate Machine Learning Models:** Implement and compare the performance of several machine learning classifiers, specifically:
    * Naive Bayes
    * Decision Tree
    * K-Nearest Neighbors

3. **Analyze Model Performance:** Present confusion matrices and classification reports to evaluate how effectively each model detects the different attack types, highlighting the strengths and weaknesses of each algorithm.

4. **Facilitate Learning:** Serve as an educational tool for learning about network intrusion detection, machine learning classification, and dataset analysis.
"""

with st.expander("About this App"):
    st.markdown(about)

# Download the dataset from the Hugging Face Hub, surfacing a readable error if it fails
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
    st.success("Dataset loaded successfully.")
except ValueError:
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    st.error(f"Unexpected error: {e}")
    st.stop()

# Work with the train split as a pandas DataFrame
df = dataset["train"].to_pandas()
st.write("### Train Split")
st.write(f"Shape: {df.shape}")
st.dataframe(df.head())

# Normalize column dtypes: keep text columns as strings, coerce numeric columns
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].astype(str)
    elif df[col].dtype in ['float64', 'int64']:
        df[col] = pd.to_numeric(df[col], errors='coerce')
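# Note: GaussianNB and KNeighborsClassifier require numeric inputs, so this script
# assumes the preprocessed dataset already encodes its categorical features numerically
# (any remaining string columns other than 'attack_cat' would make fit() fail).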

# Replace infinite values with NaN, then fill all missing values with 0
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

# Cache the cleaned DataFrame in session state so it persists across Streamlit reruns
if "df" not in st.session_state:
    st.session_state.df = df


def run_model(model, model_name):
    with st.spinner(f"Training {model_name}..."):
        df = st.session_state.df
        st.header(f"{model_name} Classifier")

        # Features: drop both target columns so neither the binary label nor the
        # attack category leaks into the inputs
        X = df.drop(columns=['label', 'attack_cat'])

        # Target: the multi-class attack category
        y = df['attack_cat']

        # Encode the string attack categories as integers for scikit-learn
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        # Log the label mapping to the console for debugging
        print("Original labels:", y.values)
        print("Encoded labels:", y_encoded)
        print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))
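        # For illustration only (hypothetical values; the actual mapping depends on the
        # categories present in this split), the printed mapping looks like:
        # {'Analysis': 0, 'Backdoor': 1, ..., 'Worms': 9}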

        # Hold out 30% of the rows for evaluation
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)
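        # Design note: this is a plain random split; passing stratify=y_encoded to
        # train_test_split would preserve the per-class proportions, which can matter
        # for the rarer attack categories.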

        # Train the classifier and predict on the held-out test set
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Map the encoded predictions back to the original category names
        y_pred_original = le.inverse_transform(y_pred)
        y_test_original = le.inverse_transform(y_test)

        # Confusion matrix: rows are true categories, columns are predicted categories
        st.write("## Confusion Matrix")
        cm = confusion_matrix(y_test_original, y_pred_original)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        st.pyplot(fig)

        # Per-class precision, recall, and F1; zero_division=1 reports 1.0 instead of
        # raising a warning when a class is never predicted
        st.write("## Classification Report")
        report = classification_report(y_test_original, y_pred_original, output_dict=True, zero_division=1)
        report_df = pd.DataFrame(report).transpose()
        st.table(report_df)


def main():
    df = st.session_state.df

    # One tab for the dataset overview plus one tab per classifier
    tabs = st.tabs(["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"])

    for i, tab in enumerate(tabs):
        with tab:
            if i == 0:
                st.header("Dataset Information")
                st.write("## Data Types")
                st.write(df.dtypes)
                st.write("## Statistical Overview")
                st.write(df.describe())
            elif i == 1:
                run_model(GaussianNB(), "Naive Bayes")
            elif i == 2:
                run_model(DecisionTreeClassifier(), "Decision Tree")
            elif i == 3:
                run_model(KNeighborsClassifier(), "K-Nearest Neighbor")


if __name__ == "__main__":
    main()
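
# A minimal way to run this app, assuming the script is saved as app.py (the filename
# here is an assumption) and the token is exported first:
#   export HF_TOKEN=<your Hugging Face token>
#   streamlit run app.py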