Spaces:

louiecerv
/

cyberattack_detection_ml_approach

Sleeping

File size: 7,081 Bytes

import os
import streamlit as st
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# Hugging Face Hub identifier of the dataset; passed to load_dataset() further down.
dataset_name = "louiecerv/unsw-nb15-preprocessed"

# Retrieve Hugging Face token from environment variable
hf_token = os.getenv("HF_TOKEN")

# A missing or empty token is reported in the UI and the script run is stopped,
# so nothing below executes without credentials.
if not hf_token:
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()

# Login to Hugging Face Hub
login(token=hf_token)

# Page title and one-line description.
st.title("🛡️📈 Cyber Attack Detection ML Approach")
st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")

# Display image
# NOTE(review): the path is relative, so the file must be present in the
# process working directory; a missing file raises here — confirm deployment layout.
image = Image.open("cyber_attack.jpg")
st.image(image, caption="Cybersecurity", use_container_width=True)

# Markdown body for the collapsible "About this App" section rendered below.
# The text is user-facing: it describes the UNSW-NB15 dataset and what the app does.
about = """
# About This App

This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark for evaluating network intrusion detection systems.  The app leverages machine learning techniques to classify network traffic as either normal or indicative of various attack types.

## About the UNSW-NB15 Dataset

The UNSW-NB15 dataset was created by the Cyber Security Lab at the University of New South Wales, Canberra. It's a comprehensive dataset containing network traffic captures (tcpdump) and system call traces.  The dataset includes a variety of modern attack types, making it a valuable resource for training and testing intrusion detection systems.  Key features of the dataset include:

* **Diverse Attack Types:**  Covers a wide range of attacks such as Fuzzers, Backdoor, DoS, Exploits, Generic, Reconnaissance, Shellcode, and Worms.
* **Realistic Network Traffic:**  Generated using a realistic network environment, simulating real-world scenarios.
* **Labeled Data:**  Each network flow is labeled with its corresponding attack type or as normal traffic, enabling supervised learning.

## App Purpose

This app aims to:

1. **Visualize and Explore the Data:**  Provide an interface to view the dataset's structure, data types, and descriptive statistics.  This allows users to understand the characteristics of the UNSW-NB15 dataset.

2. **Train and Evaluate Machine Learning Models:**  Implement and compare the performance of several machine learning classifiers, specifically:
    * Naive Bayes
    * Decision Tree
    * K-Nearest Neighbors

3. **Analyze Model Performance:**  Present confusion matrices and classification reports to evaluate the effectiveness of each model in detecting different attack types.  This helps users understand the strengths and weaknesses of each algorithm.

4. **Facilitate Learning:**  Serve as an educational tool for learning about network intrusion detection, machine learning classification, and dataset analysis.
"""
# Render the text inside an expander so it stays collapsed until the user opens it.
with st.expander("About this App"):
    st.markdown(about)

# Fetch the dataset from the Hugging Face Hub. Any failure is shown in the UI
# with a message matching its category, and the script run is stopped.
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
        st.success("Dataset loaded successfully.")
except Exception as load_error:
    if isinstance(load_error, ValueError):
        failure_message = "Dataset not found or incorrect dataset name. Please check the dataset identifier."
    elif isinstance(load_error, PermissionError):
        failure_message = "Authentication failed. Check if your Hugging Face token is correct."
    else:
        failure_message = f"Unexpected error: {load_error}"
    st.error(failure_message)
    st.stop()

# Materialise the "train" split as a DataFrame and preview it.
df = dataset["train"].to_pandas()
st.write("### Train Split")
st.write(f"Shape: {df.shape}")
st.dataframe(df.head())

# Normalise column types: object columns become plain strings, while
# float64/int64 columns are passed through pd.to_numeric with coercion.
for col in df.columns:
    col_dtype = df[col].dtype
    if col_dtype == 'object':
        df[col] = df[col].astype(str)
        continue
    if col_dtype in ['float64', 'int64']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Turn +/-infinity into NaN first, then fill every NaN with 0.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Store the cleaned frame in the session state only if none is stored yet.
if "df" not in st.session_state:
    st.session_state["df"] = df

def run_model(model, model_name):
    """Train ``model`` on the session dataset and render its evaluation.

    Reads the DataFrame stored in ``st.session_state.df``, predicts the
    ``attack_cat`` column from the remaining columns, and writes a header,
    a confusion-matrix heatmap and a classification-report table to the
    current Streamlit container.

    Args:
        model: Estimator object exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Display name used in the spinner text and section header.

    Returns:
        None. All output goes to the Streamlit page.

    Raises:
        KeyError: If the DataFrame lacks a ``label`` or ``attack_cat`` column.
    """
    with st.spinner(f"Training {model_name}..."):
        df = st.session_state.df
        st.header(f"{model_name} Classifier")

        # 'attack_cat' is the target. 'label' is a second target column
        # (presumably the binary attack flag of UNSW-NB15 — confirm), so it
        # must not stay among the features or it leaks the answer to the model.
        # Both are dropped in a single call; dropping them in two separate
        # assignments from `df` would silently keep the first one.
        X = df.drop(columns=['label', 'attack_cat'])
        y = df['attack_cat']

        # Encode the class names as integers for training.
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        # 70/30 split with a fixed seed so every classifier sees the same rows.
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Map the integer predictions back to the original class names.
        y_pred_original = le.inverse_transform(y_pred)
        y_test_original = le.inverse_transform(y_test)

        # Show confusion matrix
        st.write("## Confusion Matrix")
        class_names = le.classes_
        # Pin the row/column order to the encoder's classes so the tick
        # labels below always line up with the matrix cells.
        cm = confusion_matrix(y_test_original, y_pred_original, labels=class_names)
        fig, ax = plt.subplots(figsize=(10, 8))
        # Draw on the explicit axes instead of pyplot's implicit current axes.
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        st.pyplot(fig)
        # Release the figure; otherwise one figure per call stays registered
        # with pyplot for the lifetime of the server process.
        plt.close(fig)

        # Show classification report
        st.write("## Classification Report")
        report = classification_report(y_test_original, y_pred_original, output_dict=True, zero_division=1)
        report_df = pd.DataFrame(report).transpose()
        st.table(report_df)

def main():
    """Build the tabbed page: one dataset overview tab plus one tab per classifier.

    The overview tab shows the column dtypes and ``describe()`` statistics of
    the DataFrame held in ``st.session_state.df``. Each remaining tab calls
    ``run_model`` with a freshly constructed estimator and its display name.
    """
    frame = st.session_state.df

    dataset_tab, *model_tabs = st.tabs(["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"])

    with dataset_tab:
        st.header("Dataset Information")
        st.write("## Data Types")
        st.write(frame.dtypes)
        st.write("## Statistical Overview")
        st.write(frame.describe())

    # Estimator classes paired with their titles, in the same order as the tabs.
    classifiers = (
        (GaussianNB, "Naive Bayes"),
        (DecisionTreeClassifier, "Decision Tree"),
        (KNeighborsClassifier, "K-Nearest Neighbor"),
    )
    for tab, (estimator_cls, title) in zip(model_tabs, classifiers):
        with tab:
            # Instantiate inside the tab context, right before training.
            run_model(estimator_cls(), title)

# Script entry point: build the tabbed UI only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()