File size: 7,081 Bytes
1320746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f7249c
1320746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import os
import streamlit as st
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# Hugging Face dataset identifier used by the loading step further down.
dataset_name = "louiecerv/unsw-nb15-preprocessed"

# The access token is supplied through the environment; without it the app
# reports the problem in the page and halts the script run.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()

# Authenticate against the Hugging Face Hub with the token read above.
login(token=hf_token)

# Page heading and one-line description.
st.title("🛡️📈 Cyber Attack Detection ML Approach")
st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")

# Banner image, read from a path relative to the working directory.
image = Image.open("cyber_attack.jpg")
st.image(image, caption="Cybersecurity", use_container_width=True)

# Markdown text describing the app and the dataset. It is rendered verbatim
# inside the collapsible "About this App" expander below, so any edit to the
# string changes what users see.
about = """
# About This App

This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark for evaluating network intrusion detection systems.  The app leverages machine learning techniques to classify network traffic as either normal or indicative of various attack types.

## About the UNSW-NB15 Dataset

The UNSW-NB15 dataset was created by the Cyber Security Lab at the University of New South Wales, Canberra. It's a comprehensive dataset containing network traffic captures (tcpdump) and system call traces.  The dataset includes a variety of modern attack types, making it a valuable resource for training and testing intrusion detection systems.  Key features of the dataset include:

* **Diverse Attack Types:**  Covers a wide range of attacks such as Fuzzers, Backdoor, DoS, Exploits, Generic, Reconnaissance, Shellcode, and Worms.
* **Realistic Network Traffic:**  Generated using a realistic network environment, simulating real-world scenarios.
* **Labeled Data:**  Each network flow is labeled with its corresponding attack type or as normal traffic, enabling supervised learning.

## App Purpose

This app aims to:

1. **Visualize and Explore the Data:**  Provide an interface to view the dataset's structure, data types, and descriptive statistics.  This allows users to understand the characteristics of the UNSW-NB15 dataset.

2. **Train and Evaluate Machine Learning Models:**  Implement and compare the performance of several machine learning classifiers, specifically:
    * Naive Bayes
    * Decision Tree
    * K-Nearest Neighbors

3. **Analyze Model Performance:**  Present confusion matrices and classification reports to evaluate the effectiveness of each model in detecting different attack types.  This helps users understand the strengths and weaknesses of each algorithm.

4. **Facilitate Learning:**  Serve as an educational tool for learning about network intrusion detection, machine learning classification, and dataset analysis.
"""
# Show the description inside a collapsible section of the page.
with st.expander("About this App"):
    st.markdown(about)

# Fetch the dataset behind a spinner. Any failure is reported in the page with
# a message chosen by exception type, after which the script run is halted.
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
        st.success("Dataset loaded successfully.")
except Exception as load_error:
    if isinstance(load_error, ValueError):
        failure_message = "Dataset not found or incorrect dataset name. Please check the dataset identifier."
    elif isinstance(load_error, PermissionError):
        failure_message = "Authentication failed. Check if your Hugging Face token is correct."
    else:
        failure_message = f"Unexpected error: {load_error}"
    st.error(failure_message)
    st.stop()

# Turn the "train" split into a DataFrame and show a short preview of it.
df = dataset["train"].to_pandas()
st.write("### Train Split")
st.write(f"Shape: {df.shape}")
st.dataframe(df.head())

# Normalise column types: object columns become plain strings, while
# float64/int64 columns are passed through pd.to_numeric with coercion.
for col in df.columns:
    col_dtype = df[col].dtype
    if col_dtype == 'object':
        df[col] = df[col].astype(str)
        continue
    if col_dtype in ['float64', 'int64']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Infinite values are turned into NaN first so that a single fill step can
# replace every missing value with 0.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)

# Store the cleaned frame in the session state only if none is stored yet.
if "df" not in st.session_state:
    st.session_state["df"] = df

def run_model(model, model_name):
    """Train a classifier on the cached dataset and render its evaluation.

    The cleaned DataFrame is read from ``st.session_state.df``. The
    ``attack_cat`` column is the prediction target; every other column except
    ``label`` is used as a feature. A 70/30 train/test split with a fixed
    random seed is used, and the confusion matrix and classification report
    for the test part are written to the current Streamlit container.

    Args:
        model: Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Display name used in the spinner text and section header.
    """
    with st.spinner(f"Training {model_name}..."):
        df = st.session_state.df
        st.header(f"{model_name} Classifier")

        # Both target columns must be removed from the features in a single
        # drop. Dropping them in two separate assignments left 'label' (the
        # other target variable) inside X, leaking the answer to the model.
        X = df.drop(columns=['label', 'attack_cat'])
        y = df['attack_cat']

        # Encode the class names as integers for the estimator.
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        # Console output of the encoding, kept for manual verification.
        print("Original labels:", y.values)
        print("Encoded labels:", y_encoded)
        print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

        # Hold out 30% of the rows for evaluation; the seed keeps runs repeatable.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_encoded, test_size=0.3, random_state=42
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Map the integer predictions back to the original class names.
        y_pred_original = le.inverse_transform(y_pred)
        y_test_original = le.inverse_transform(y_test)

        # Confusion matrix, with rows/columns pinned to the encoder's class
        # order so the tick labels on the heat map name the actual classes.
        st.write("## Confusion Matrix")
        class_names = list(le.classes_)
        cm = confusion_matrix(y_test_original, y_pred_original, labels=class_names)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        st.pyplot(fig)
        # Release the figure; otherwise pyplot keeps every figure created
        # across script reruns alive.
        plt.close(fig)

        # Per-class precision/recall/F1 as a table.
        st.write("## Classification Report")
        report = classification_report(
            y_test_original, y_pred_original, output_dict=True, zero_division=1
        )
        report_df = pd.DataFrame(report).transpose()
        st.table(report_df)

def main():
    """Lay out the four tabs of the app and fill each one.

    The first tab shows the column dtypes and ``describe()`` summary of the
    DataFrame stored in ``st.session_state.df``; the remaining three tabs each
    train and evaluate one classifier through ``run_model``.
    """
    frame = st.session_state.df

    dataset_tab, bayes_tab, tree_tab, knn_tab = st.tabs(
        ["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"]
    )

    # Dataset overview tab.
    with dataset_tab:
        st.header("Dataset Information")
        st.write("## Data Types")
        st.write(frame.dtypes)
        st.write("## Statistical Overview")
        st.write(frame.describe())

    # One classifier per remaining tab, rendered in tab order. The estimator
    # is only constructed when its tab is being filled.
    classifier_tabs = (
        (bayes_tab, GaussianNB, "Naive Bayes"),
        (tree_tab, DecisionTreeClassifier, "Decision Tree"),
        (knn_tab, KNeighborsClassifier, "K-Nearest Neighbor"),
    )
    for tab, make_estimator, title in classifier_tabs:
        with tab:
            run_model(make_estimator(), title)

# Build the tabbed part of the page only when this file is run as the entry
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()