import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from datasets import load_dataset
from huggingface_hub import login
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Streamlit UI
dataset_name = "louiecerv/unsw-nb15-preprocessed"

# Retrieve the Hugging Face token from an environment variable
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()

# Log in to the Hugging Face Hub
login(token=hf_token)

st.title("🛡️📈 Cyber Attack Detection ML Approach")
st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")

# Display the header image
image = Image.open("cyber_attack.jpg")
st.image(image, caption="Cybersecurity", use_container_width=True)

about = """
# About This App

This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark
for evaluating network intrusion detection systems. The app uses machine learning techniques to
classify network traffic as either normal or as one of several attack types.

## About the UNSW-NB15 Dataset

The UNSW-NB15 dataset was created in the Cyber Range Lab of the Australian Centre for Cyber
Security (ACCS) at UNSW Canberra. It is a comprehensive dataset of raw network traffic captures
(tcpdump/pcap) together with flow features extracted using the Argus and Bro-IDS tools. The
dataset covers a variety of modern attack types, making it a valuable resource for training and
testing intrusion detection systems.

Key features of the dataset include:

* **Diverse Attack Types:** Covers nine attack categories: Fuzzers, Analysis, Backdoors, DoS,
  Exploits, Generic, Reconnaissance, Shellcode, and Worms.
* **Realistic Network Traffic:** Produced in a realistic testbed environment that blends normal
  activity with contemporary attack behaviours.
* **Labeled Data:** Each network flow is labeled with its attack category or as normal traffic,
  enabling supervised learning.

## App Purpose

This app aims to:

1. **Visualize and Explore the Data:** Provide an interface to view the dataset's structure,
   data types, and descriptive statistics, so users can understand the characteristics of the
   UNSW-NB15 dataset.
2. **Train and Evaluate Machine Learning Models:** Implement and compare the performance of
   several machine learning classifiers, specifically:
   * Naive Bayes
   * Decision Tree
   * K-Nearest Neighbors
3. **Analyze Model Performance:** Present confusion matrices and classification reports to
   evaluate how well each model detects the different attack types, highlighting the strengths
   and weaknesses of each algorithm.
4. **Facilitate Learning:** Serve as an educational tool for learning about network intrusion
   detection, machine learning classification, and dataset analysis.
"""

with st.expander("About this App"):
    st.markdown(about)

# Load the dataset from the Hugging Face Hub
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
    st.success("Dataset loaded successfully.")
except ValueError:
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    st.error(f"Unexpected error: {e}")
    st.stop()
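
# --- Optional sketch (an assumption, not used by the app as written) ---
# Streamlit re-executes this whole script on every user interaction, so the
# load_dataset() call above runs again on each rerun. One common pattern is to
# wrap the download/conversion in a function decorated with st.cache_data so the
# DataFrame is fetched once and reused. The helper name below is illustrative only.
@st.cache_data
def load_train_dataframe(name: str) -> pd.DataFrame:
    """Download the dataset once and return its cached 'train' split as a DataFrame."""
    return load_dataset(name)["train"].to_pandas()

# Example usage (hypothetical): df = load_train_dataframe(dataset_name)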
df = dataset["train"].to_pandas()
st.write("### Train Split")
st.write(f"Shape: {df.shape}")
st.dataframe(df.head())

# Normalize column dtypes: cast object columns to string and coerce numeric columns
for col in df.columns:
    if df[col].dtype == 'object':  # likely mixed types
        df[col] = df[col].astype(str)
    elif df[col].dtype in ['float64', 'int64']:
        df[col] = pd.to_numeric(df[col], errors='coerce')  # force numeric values

# Replace infinities and missing values
df.replace([np.inf, -np.inf], np.nan, inplace=True)  # replace infinities with NaN
df.fillna(0, inplace=True)  # replace NaNs with 0 (or another appropriate default)

if "df" not in st.session_state:
    st.session_state.df = df


def run_model(model, model_name):
    with st.spinner(f"Training {model_name}..."):
        df = st.session_state.df
        st.header(f"{model_name} Classifier")

        # Prepare the data: 'attack_cat' is the target. Drop it and the binary
        # 'label' column (another target we don't want leaking into the features).
        X = df.drop(columns=['label', 'attack_cat'])  # Features
        y = df['attack_cat']  # Target

        # Encode the attack categories as integers
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        # Log the encoded labels and mapping for verification (optional)
        print("Original labels:", y.values)
        print("Encoded labels:", y_encoded)
        print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_encoded, test_size=0.3, random_state=42
        )

        # Train the model
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_test)

        # Reverse the encoding so the reports show the original category names
        y_pred_original = le.inverse_transform(y_pred)
        y_test_original = le.inverse_transform(y_test)

        # Show the confusion matrix
        st.write("## Confusion Matrix")
        cm = confusion_matrix(y_test_original, y_pred_original)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        st.pyplot(fig)

        # Show the classification report
        st.write("## Classification Report")
        report = classification_report(
            y_test_original, y_pred_original, output_dict=True, zero_division=1
        )
        report_df = pd.DataFrame(report).transpose()
        st.table(report_df)


def main():
    # Load the dataset from session state
    df = st.session_state.df

    # Create tabs
    tabs = st.tabs(["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"])

    # Iterate through the tabs
    for i, tab in enumerate(tabs):
        with tab:
            if i == 0:  # Dataset tab
                st.header("Dataset Information")
                st.write("## Data Types")
                st.write(df.dtypes)
                st.write("## Statistical Overview")
                st.write(df.describe())
            elif i == 1:  # Naive Bayes tab
                run_model(GaussianNB(), "Naive Bayes")
            elif i == 2:  # Decision Tree tab
                run_model(DecisionTreeClassifier(), "Decision Tree")
            elif i == 3:  # K-Nearest Neighbor tab
                run_model(KNeighborsClassifier(), "K-Nearest Neighbor")


if __name__ == "__main__":
    main()
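

# --- Optional sketch (an assumption, not part of the app above) ---
# KNeighborsClassifier is distance-based, so features with large numeric ranges
# can dominate the distance metric. Below is a minimal sketch of wrapping the
# estimator in a scikit-learn Pipeline with StandardScaler; the pipeline exposes
# the same fit()/predict() interface, so it could be passed to run_model()
# unchanged. The helper name and the "(scaled)" label are illustrative only.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def make_scaled_knn(n_neighbors: int = 5) -> Pipeline:
    """Return a KNN classifier that standardizes features before fitting."""
    return Pipeline([
        ("scaler", StandardScaler()),  # zero mean, unit variance per feature
        ("knn", KNeighborsClassifier(n_neighbors=n_neighbors)),
    ])

# Example usage (hypothetical): run_model(make_scaled_knn(), "K-Nearest Neighbor (scaled)")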