sync with remote
Browse files- app.py +177 -0
- cyber_attack.jpg +0 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
from datasets import load_dataset
|
4 |
+
import pandas as pd
|
5 |
+
from huggingface_hub import login
|
6 |
+
from sklearn.model_selection import train_test_split
|
7 |
+
from sklearn.naive_bayes import GaussianNB
|
8 |
+
from sklearn.tree import DecisionTreeClassifier
|
9 |
+
from sklearn.neighbors import KNeighborsClassifier
|
10 |
+
from sklearn.metrics import confusion_matrix, classification_report
|
11 |
+
from sklearn.preprocessing import LabelEncoder
|
12 |
+
import seaborn as sns
|
13 |
+
import matplotlib.pyplot as plt
|
14 |
+
import numpy as np
|
15 |
+
from PIL import Image
|
16 |
+
|
17 |
+
# Hugging Face dataset repository analysed by this app
dataset_name = "louiecerv/unsw-nb15-preprocessed"

# The access token is read from the environment rather than hard-coded
hf_token = os.environ.get("HF_TOKEN")

if hf_token:
    # Authenticate against the Hugging Face Hub before touching the dataset
    login(token=hf_token)
else:
    # Without a token there is nothing useful to render: report and halt
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()
|
29 |
+
|
30 |
+
# Markdown shown inside the collapsible "About this App" section
about = """
# About This App

This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark for evaluating network intrusion detection systems. The app leverages machine learning techniques to classify network traffic as either normal or indicative of various attack types.

## About the UNSW-NB15 Dataset

The UNSW-NB15 dataset was created by the Cyber Security Lab at the University of New South Wales, Canberra. It's a comprehensive dataset containing network traffic captures (tcpdump) and system call traces. The dataset includes a variety of modern attack types, making it a valuable resource for training and testing intrusion detection systems. Key features of the dataset include:

* **Diverse Attack Types:** Covers a wide range of attacks such as Fuzzers, Backdoor, DoS, Exploits, Generic, Reconnaissance, Shellcode, and Worms.
* **Realistic Network Traffic:** Generated using a realistic network environment, simulating real-world scenarios.
* **Labeled Data:** Each network flow is labeled with its corresponding attack type or as normal traffic, enabling supervised learning.

## App Purpose

This app aims to:

1. **Visualize and Explore the Data:** Provide an interface to view the dataset's structure, data types, and descriptive statistics. This allows users to understand the characteristics of the UNSW-NB15 dataset.

2. **Train and Evaluate Machine Learning Models:** Implement and compare the performance of several machine learning classifiers, specifically:
* Naive Bayes
* Decision Tree
* K-Nearest Neighbors

3. **Analyze Model Performance:** Present confusion matrices and classification reports to evaluate the effectiveness of each model in detecting different attack types. This helps users understand the strengths and weaknesses of each algorithm.

4. **Facilitate Learning:** Serve as an educational tool for learning about network intrusion detection, machine learning classification, and dataset analysis.
"""

# Page heading and one-line description
st.title("Cyber Attack Detection Dataset ML Analysis")
st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")

# Banner picture shipped alongside the script
image = Image.open("cyber_attack.jpg")
st.image(image, caption="Cybersecurity", use_container_width=True)

# Collapsible background information
st.expander("About this App").markdown(about)
|
67 |
+
|
68 |
+
# Load dataset
# NOTE(review): `datasets` is believed to report an unknown or inaccessible
# repository with a FileNotFoundError subclass (confirm against the installed
# version), so that case is handled together with ValueError instead of
# falling through to the generic "Unexpected error" branch.
try:
    # Keep the try body down to the one call that is expected to fail
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
except (FileNotFoundError, ValueError):
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    # Last-resort boundary: surface the message in the UI and halt the script
    st.error(f"Unexpected error: {e}")
    st.stop()
else:
    st.success("Dataset loaded successfully.")

# Everything below relies on the "train" split, so fail with a readable
# message instead of a raw KeyError when it is missing.
if "train" not in dataset:
    st.error("The dataset does not provide a 'train' split.")
    st.stop()

df = dataset["train"].to_pandas()
st.write("### Train Split")
st.write(f"Shape: {df.shape}")
st.dataframe(df.head())
|
87 |
+
|
88 |
+
# Object columns may hold mixed Python types; normalise them to plain strings.
# Columns that already have a numeric dtype need no conversion, so they are
# left untouched here.
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].astype(str)

# Replace infinities with NaN first so a single fillna covers both cases
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(0, inplace=True)  # Replace NaNs with 0 as the default value

# Store the cleaned frame once per session; run_model() and main() read it
# back from st.session_state.
if "df" not in st.session_state:
    st.session_state.df = df
|
101 |
+
|
102 |
+
def run_model(model, model_name):
    """Train ``model`` on the session dataset and render its evaluation.

    The data frame is read from ``st.session_state.df``. The ``attack_cat``
    column is the target; it is label-encoded, the data is split 70/30 with
    a fixed random seed, and the fitted model's predictions on the test part
    are shown as a confusion-matrix heatmap and a classification report.

    Args:
        model: Estimator object exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Display name used in the spinner text and section header.
    """
    with st.spinner(f"Training {model_name}..."):
        df = st.session_state.df
        st.header(f"{model_name} Classifier")

        # 'label' is the other target column. Both targets must be dropped in
        # one call: dropping them in two separate assignments from `df` would
        # leave 'label' among the features and leak the answer to the model.
        X = df.drop(columns=['label', 'attack_cat'])  # Features
        y = df['attack_cat']  # Target

        # Encode the attack categories as integers for the estimators
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)
        class_names = list(le.classes_)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

        # Train the model and predict on the held-out part
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Map the integer codes back to the original category names
        y_pred_original = le.inverse_transform(y_pred)
        y_test_original = le.inverse_transform(y_test)

        # Show confusion matrix
        st.write("## Confusion Matrix")
        # Passing the full class list keeps rows/columns aligned with the tick
        # labels even when a category is absent from the test split.
        cm = confusion_matrix(y_test_original, y_pred_original, labels=class_names)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        st.pyplot(fig)
        # Release the figure; otherwise every rerun leaves one more open
        # matplotlib figure behind.
        plt.close(fig)

        # Show classification report
        st.write("## Classification Report")
        report = classification_report(y_test_original, y_pred_original, output_dict=True, zero_division=1)
        report_df = pd.DataFrame(report).transpose()
        st.table(report_df)
|
151 |
+
|
152 |
+
def main():
    """Render the dataset overview tab and one evaluation tab per classifier."""
    # Cleaned data frame cached in the session state
    df = st.session_state.df

    # One tab for the data overview followed by one per model
    overview_tab, bayes_tab, tree_tab, knn_tab = st.tabs(
        ["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"]
    )

    with overview_tab:
        st.header("Dataset Information")
        st.write("## Data Types")
        st.write(df.dtypes)
        st.write("## Statistical Overview")
        st.write(df.describe())

    with bayes_tab:
        run_model(GaussianNB(), "Naive Bayes")

    with tree_tab:
        run_model(DecisionTreeClassifier(), "Decision Tree")

    with knn_tab:
        run_model(KNeighborsClassifier(), "K-Nearest Neighbor")


if __name__ == "__main__":
    main()
|
cyber_attack.jpg
ADDED
![]() |
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
datasets
|
2 |
+
pandas
|
3 |
+
streamlit
|
4 |
+
numpy
|
5 |
+
matplotlib
|
6 |
+
scikit-learn
|
7 |
+
seaborn
|
8 |
+
Pillow
|