louiecerv's picture
updated the readme file
2f7249c
import os
import streamlit as st
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
# Hugging Face Hub identifier of the dataset this app analyses.
dataset_name = "louiecerv/unsw-nb15-preprocessed"

# The Hub token is read from the environment. Without one the app reports
# the problem on the page and stops the current script run.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()

# Authenticate against the Hugging Face Hub.
login(token=hf_token)

# Page header.
st.title("🛡️📈 Cyber Attack Detection ML Approach")
st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")

# Banner image, stretched to the width of its container.
image = Image.open("cyber_attack.jpg")
st.image(image, caption="Cybersecurity", use_container_width=True)
# Markdown shown inside the collapsible "About this App" section.
about = """
# About This App
This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark for evaluating network intrusion detection systems. The app leverages machine learning techniques to classify network traffic as either normal or indicative of various attack types.
## About the UNSW-NB15 Dataset
The UNSW-NB15 dataset was created by the Cyber Security Lab at the University of New South Wales, Canberra. It's a comprehensive dataset containing network traffic captures (tcpdump) and system call traces. The dataset includes a variety of modern attack types, making it a valuable resource for training and testing intrusion detection systems. Key features of the dataset include:
* **Diverse Attack Types:** Covers a wide range of attacks such as Fuzzers, Backdoor, DoS, Exploits, Generic, Reconnaissance, Shellcode, and Worms.
* **Realistic Network Traffic:** Generated using a realistic network environment, simulating real-world scenarios.
* **Labeled Data:** Each network flow is labeled with its corresponding attack type or as normal traffic, enabling supervised learning.
## App Purpose
This app aims to:
1. **Visualize and Explore the Data:** Provide an interface to view the dataset's structure, data types, and descriptive statistics. This allows users to understand the characteristics of the UNSW-NB15 dataset.
2. **Train and Evaluate Machine Learning Models:** Implement and compare the performance of several machine learning classifiers, specifically:
* Naive Bayes
* Decision Tree
* K-Nearest Neighbors
3. **Analyze Model Performance:** Present confusion matrices and classification reports to evaluate the effectiveness of each model in detecting different attack types. This helps users understand the strengths and weaknesses of each algorithm.
4. **Facilitate Learning:** Serve as an educational tool for learning about network intrusion detection, machine learning classification, and dataset analysis.
"""

# Write the text straight into the expander container it belongs to.
st.expander("About this App").markdown(about)
def _abort_load(message):
    """Display *message* as an error on the page and stop this script run."""
    st.error(message)
    st.stop()


# Download the dataset; every failure path reports to the page and halts.
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
    st.success("Dataset loaded successfully.")
except ValueError:
    _abort_load("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
except PermissionError:
    _abort_load("Authentication failed. Check if your Hugging Face token is correct.")
except Exception as err:
    _abort_load(f"Unexpected error: {err}")

# Convert the "train" split to pandas and show a short preview.
df = dataset["train"].to_pandas()
st.write("### Train Split")
st.write(f"Shape: {df.shape}")
st.dataframe(df.head())
# Normalise column types: object columns are cast to str, while float64 and
# int64 columns go through pd.to_numeric with unparseable values coerced to NaN.
for col, col_dtype in df.dtypes.items():
    if col_dtype == 'object':
        df[col] = df[col].astype(str)
    elif col_dtype in ['float64', 'int64']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Turn +/-infinity into NaN, then replace every NaN with 0.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Store the cleaned frame once per session; later reruns keep the stored copy.
if "df" not in st.session_state:
    st.session_state.df = df
def run_model(model, model_name):
    """Train ``model`` on the session dataset and render its evaluation.

    The DataFrame stored in ``st.session_state.df`` is split 70/30 into
    train and test sets with a fixed ``random_state`` of 42. The classifier
    is fitted on the training part, and a confusion matrix plus a per-class
    classification report for the test part are written to the page.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.
        model_name: Display name used in the spinner text and the header.

    Raises:
        KeyError: If the DataFrame lacks the ``label`` or ``attack_cat``
            column.
    """
    with st.spinner(f"Training {model_name}..."):
        df = st.session_state.df
        st.header(f"{model_name} Classifier")

        # 'attack_cat' is the target and 'label' is a second target column.
        # Both must be removed from the features in a single drop; two
        # separate drops from `df` would leave 'label' in X and leak the
        # answer into the model.
        X = df.drop(columns=['label', 'attack_cat'])
        y = df['attack_cat']

        # Encode the target as integers for training; the encoder is kept so
        # predictions can be mapped back to the original class values.
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y_encoded, test_size=0.3, random_state=42
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Report results in terms of the original class values.
        y_pred_original = le.inverse_transform(y_pred)
        y_test_original = le.inverse_transform(y_test)

        st.write("## Confusion Matrix")
        # Passing the full class list fixes the row/column order and lets the
        # heatmap axes carry class names instead of bare indices.
        class_names = le.classes_
        cm = confusion_matrix(y_test_original, y_pred_original, labels=class_names)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        st.pyplot(fig)
        # Streamlit reruns the script on every interaction; close the figure
        # so Matplotlib does not accumulate open figures across reruns.
        plt.close(fig)

        st.write("## Classification Report")
        report = classification_report(
            y_test_original, y_pred_original, output_dict=True, zero_division=1
        )
        report_df = pd.DataFrame(report).transpose()
        st.table(report_df)
def main():
    """Lay out the app's tabs: a dataset overview and one tab per classifier."""
    df = st.session_state.df

    # One named handle per tab, in display order.
    dataset_tab, nb_tab, tree_tab, knn_tab = st.tabs(
        ["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"]
    )

    with dataset_tab:
        st.header("Dataset Information")
        st.write("## Data Types")
        st.write(df.dtypes)
        st.write("## Statistical Overview")
        st.write(df.describe())

    with nb_tab:
        run_model(GaussianNB(), "Naive Bayes")

    with tree_tab:
        run_model(DecisionTreeClassifier(), "Decision Tree")

    with knn_tab:
        run_model(KNeighborsClassifier(), "K-Nearest Neighbor")


if __name__ == "__main__":
    main()