Spaces:

louiecerv
/

cyberattack_detection_ml_approach

Sleeping

App Files Files Community

louiecerv committed on Feb 18

Commit

1320746

1 Parent(s): be5af3f

sync with remote

Browse files

Files changed (3) hide show

app.py +177 -0
cyber_attack.jpg +0 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import os
+import streamlit as st
+from datasets import load_dataset
+import pandas as pd
+from huggingface_hub import login
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import GaussianNB
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.metrics import confusion_matrix, classification_report
+from sklearn.preprocessing import LabelEncoder
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+# Streamlit UI
+dataset_name = "louiecerv/unsw-nb15-preprocessed"
+# Retrieve Hugging Face token from environment variable
+hf_token = os.getenv("HF_TOKEN")
+if not hf_token:
+    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
+    st.stop()
+# Login to Hugging Face Hub
+login(token=hf_token)
+st.title("Cyber Attack Detection Dataset ML Analysis")
+st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")
+# Display image
+image = Image.open("cyber_attack.jpg")
+st.image(image, caption="Cybersecurity", use_container_width=True)
+about = """
+# About This App
+This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark for evaluating network intrusion detection systems.  The app leverages machine learning techniques to classify network traffic as either normal or indicative of various attack types.
+## About the UNSW-NB15 Dataset
+The UNSW-NB15 dataset was created by the Cyber Security Lab at the University of New South Wales, Canberra. It's a comprehensive dataset containing network traffic captures (tcpdump) and system call traces.  The dataset includes a variety of modern attack types, making it a valuable resource for training and testing intrusion detection systems.  Key features of the dataset include:
+* **Diverse Attack Types:**  Covers a wide range of attacks such as Fuzzers, Backdoor, DoS, Exploits, Generic, Reconnaissance, Shellcode, and Worms.
+* **Realistic Network Traffic:**  Generated using a realistic network environment, simulating real-world scenarios.
+* **Labeled Data:**  Each network flow is labeled with its corresponding attack type or as normal traffic, enabling supervised learning.
+## App Purpose
+This app aims to:
+1. **Visualize and Explore the Data:**  Provide an interface to view the dataset's structure, data types, and descriptive statistics.  This allows users to understand the characteristics of the UNSW-NB15 dataset.
+2. **Train and Evaluate Machine Learning Models:**  Implement and compare the performance of several machine learning classifiers, specifically:
+    * Naive Bayes
+    * Decision Tree
+    * K-Nearest Neighbors
+3. **Analyze Model Performance:**  Present confusion matrices and classification reports to evaluate the effectiveness of each model in detecting different attack types.  This helps users understand the strengths and weaknesses of each algorithm.
+4. **Facilitate Learning:**  Serve as an educational tool for learning about network intrusion detection, machine learning classification, and dataset analysis.
+"""
+with st.expander("About this App"):
+    st.markdown(about)
+# Load dataset
+try:
+    with st.spinner("Loading dataset..."):
+        dataset = load_dataset(dataset_name)
+        st.success("Dataset loaded successfully.")
+except ValueError:
+    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
+    st.stop()
+except PermissionError:
+    st.error("Authentication failed. Check if your Hugging Face token is correct.")
+    st.stop()
+except Exception as e:
+    st.error(f"Unexpected error: {e}")
+    st.stop()
+df = dataset["train"].to_pandas()
+st.write(f"### Train Split")
+st.write(f"Shape: {df.shape}")
+st.dataframe(df.head())
+# Convert mixed-type columns to string or numeric
+for col in df.columns:
+    if df[col].dtype == 'object':  # Likely mixed types
+        df[col] = df[col].astype(str)
+    elif df[col].dtype in ['float64', 'int64']:
+        df[col] = pd.to_numeric(df[col], errors='coerce')  # Force numeric values
+# Replace inf values and NaNs
+df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinities with NaN
+df.fillna(0, inplace=True)  # Replace NaNs with 0 or an appropriate default value
+if "df" not in st.session_state:
+    st.session_state.df = df
+def run_model(model, model_name):
+    with st.spinner(f"Training {model_name}..."):
+        df = st.session_state.df
+        st.header(f"{model_name} Classifier")
+        # Prepare data (assuming 'label' is the target variable)
+        X = df.drop(columns=['label'])  # Another target variable we don't want to include
+        X = df.drop(columns=['attack_cat'])  # Features
+        y = df['attack_cat']  # Target
+        # Initialize LabelEncoder
+        le = LabelEncoder()
+        # Fit and transform the target variable
+        y_encoded = le.fit_transform(y)
+        # Print the encoded labels and mapping for verification (optional)
+        print("Original labels:", y.values)
+        print("Encoded labels:", y_encoded)
+        print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))
+        # Split data
+        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)
+        # Train the model
+        model.fit(X_train, y_train)
+        # Predict
+        y_pred = model.predict(X_test)
+        # Reverse the encoding of y_pred
+        y_pred_original = le.inverse_transform(y_pred)
+        y_test_original = le.inverse_transform(y_test)
+        # Show confusion matrix
+        st.write("## Confusion Matrix")
+        cm = confusion_matrix(y_test_original, y_pred_original)
+        fig, ax = plt.subplots(figsize=(10, 8))
+        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+        plt.xlabel('Predicted')
+        plt.ylabel('True')
+        st.pyplot(fig)
+        # Show classification report
+        st.write("## Classification Report")
+        report = classification_report(y_test_original, y_pred_original, output_dict=True, zero_division=1)
+        report_df = pd.DataFrame(report).transpose()
+        st.table(report_df)
+def main():
+    # Load the dataset
+    df = st.session_state.df
+    # Create tabs
+    tabs = st.tabs(["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"])
+    # Iterate through the tabs
+    for i, tab in enumerate(tabs):
+        with tab:  # Use 'with tab:' here
+            if i == 0:  # Dataset tab
+                st.header("Dataset Information")
+                st.write("## Data Types")
+                st.write(df.dtypes)
+                st.write("## Statistical Overview")
+                st.write(df.describe())
+            elif i == 1:  # Naive Bayes tab
+                run_model(GaussianNB(), "Naive Bayes")
+            elif i == 2:  # Decision Tree tab
+                run_model(DecisionTreeClassifier(), "Decision Tree")
+            elif i == 3:  # K-Nearest Neighbor tab
+                run_model(KNeighborsClassifier(), "K-Nearest Neighbor")
+if __name__ == "__main__":
+    main()

cyber_attack.jpg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+datasets
+pandas
+streamlit
+numpy
+matplotlib
+scikit-learn
+seaborn
+Pillow