# Listing metadata (not part of the program): file size 7,081 bytes; revisions 1320746, 2f7249c.
import os
import streamlit as st
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
# --- Configuration and authentication ---
# Hub identifier of the dataset analysed by this app.
dataset_name = "louiecerv/unsw-nb15-preprocessed"

# The Hugging Face token is taken from the environment. A missing or empty
# value is reported in the UI and st.stop() is called.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()

# Authenticate with the Hugging Face Hub using that token.
login(token=hf_token)

# --- Page header ---
st.title("🛡️📈 Cyber Attack Detection ML Approach")
st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")

# Banner image, opened from a path relative to the working directory.
image = Image.open("cyber_attack.jpg")
st.image(image, caption="Cybersecurity", use_container_width=True)
# Markdown shown in the collapsible "About this App" section.
about = """
# About This App
This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark for evaluating network intrusion detection systems. The app leverages machine learning techniques to classify network traffic as either normal or indicative of various attack types.
## About the UNSW-NB15 Dataset
The UNSW-NB15 dataset was created by the Cyber Security Lab at the University of New South Wales, Canberra. It's a comprehensive dataset containing network traffic captures (tcpdump) and system call traces. The dataset includes a variety of modern attack types, making it a valuable resource for training and testing intrusion detection systems. Key features of the dataset include:
* **Diverse Attack Types:** Covers a wide range of attacks such as Fuzzers, Backdoor, DoS, Exploits, Generic, Reconnaissance, Shellcode, and Worms.
* **Realistic Network Traffic:** Generated using a realistic network environment, simulating real-world scenarios.
* **Labeled Data:** Each network flow is labeled with its corresponding attack type or as normal traffic, enabling supervised learning.
## App Purpose
This app aims to:
1. **Visualize and Explore the Data:** Provide an interface to view the dataset's structure, data types, and descriptive statistics. This allows users to understand the characteristics of the UNSW-NB15 dataset.
2. **Train and Evaluate Machine Learning Models:** Implement and compare the performance of several machine learning classifiers, specifically:
* Naive Bayes
* Decision Tree
* K-Nearest Neighbors
3. **Analyze Model Performance:** Present confusion matrices and classification reports to evaluate the effectiveness of each model in detecting different attack types. This helps users understand the strengths and weaknesses of each algorithm.
4. **Facilitate Learning:** Serve as an educational tool for learning about network intrusion detection, machine learning classification, and dataset analysis.
"""

# Write the markdown straight into the expander container.
st.expander("About this App").markdown(about)
# Load dataset
# Load the dataset named by `dataset_name`. Every failure path reports the
# problem in the UI and then calls st.stop(); the code further down indexes
# `dataset` directly.
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
except (FileNotFoundError, ValueError):
    # NOTE(review): `datasets` appears to signal a missing or inaccessible
    # repository with FileNotFoundError-derived exceptions (e.g.
    # DatasetNotFoundError) -- confirm against the installed version. They are
    # caught here so the user sees this message rather than the generic one.
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    # Last-resort boundary: surface anything unexpected instead of crashing.
    st.error(f"Unexpected error: {e}")
    st.stop()
else:
    # Kept outside the `try` so only the load itself is guarded.
    st.success("Dataset loaded successfully.")
# Convert the "train" split to a pandas DataFrame and preview it.
df = dataset["train"].to_pandas()
st.write("### Train Split")
st.write(f"Shape: {df.shape}")
st.dataframe(df.head())

# Normalise column types: object columns are cast to str, while float64/int64
# columns go through pd.to_numeric with unparseable values coerced to NaN.
for column in df.columns:
    column_dtype = df[column].dtype
    if column_dtype == 'object':
        df[column] = df[column].astype(str)
    elif column_dtype in ('float64', 'int64'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

# Infinities become NaN first so a single fill replaces both with 0.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Store the cleaned frame in session state; an existing entry is left as-is.
if "df" not in st.session_state:
    st.session_state["df"] = df
def run_model(model, model_name):
    """Train ``model`` on the session dataset and render its evaluation.

    The DataFrame is read from ``st.session_state.df``. ``attack_cat`` is the
    prediction target. Both ``attack_cat`` and ``label`` are removed from the
    feature matrix, so the classifier never sees a target-like column. Rows
    are split 70/30 with a fixed seed. A confusion-matrix heatmap and a
    classification report are then written to the Streamlit page.

    Args:
        model: Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Display name used in the spinner text and section header.

    Raises:
        KeyError: If the DataFrame has no ``label`` or ``attack_cat`` column.
    """
    with st.spinner(f"Training {model_name}..."):
        df = st.session_state.df
        st.header(f"{model_name} Classifier")

        # Drop BOTH target-like columns in one call. Two separate
        # `X = df.drop(...)` assignments would let the second overwrite the
        # first and leave `label` in the features, leaking the answer.
        X = df.drop(columns=['label', 'attack_cat'])  # Features
        y = df['attack_cat']  # Target

        # Encode class names as integers for training; the encoder is kept so
        # predictions can be mapped back to readable names.
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        # Fixed random_state keeps the split reproducible across reruns.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_encoded, test_size=0.3, random_state=42
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Back to the original class names for reporting.
        y_pred_original = le.inverse_transform(y_pred)
        y_test_original = le.inverse_transform(y_test)

        # Show confusion matrix
        st.write("## Confusion Matrix")
        class_names = list(le.classes_)
        # Passing `labels` fixes the row/column order so the tick labels
        # below are guaranteed to line up with the matrix.
        cm = confusion_matrix(y_test_original, y_pred_original, labels=class_names)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        st.pyplot(fig)
        # Release the figure so repeated runs do not accumulate open figures.
        plt.close(fig)

        # Show classification report
        st.write("## Classification Report")
        report = classification_report(
            y_test_original, y_pred_original, output_dict=True, zero_division=1
        )
        report_df = pd.DataFrame(report).transpose()
        st.table(report_df)
def main():
    """Render the app's four tabs: a dataset overview and one tab per classifier."""
    df = st.session_state.df

    dataset_tab, bayes_tab, tree_tab, knn_tab = st.tabs(
        ["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"]
    )

    # Dataset tab: column dtypes and summary statistics.
    with dataset_tab:
        st.header("Dataset Information")
        st.write("## Data Types")
        st.write(df.dtypes)
        st.write("## Statistical Overview")
        st.write(df.describe())

    # One tab per classifier, in the same order as the tab labels.
    with bayes_tab:
        run_model(GaussianNB(), "Naive Bayes")

    with tree_tab:
        run_model(DecisionTreeClassifier(), "Decision Tree")

    with knn_tab:
        run_model(KNeighborsClassifier(), "K-Nearest Neighbor")
# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()