louiecerv committed on
Commit
1320746
·
1 Parent(s): be5af3f

sync with remote

Browse files
Files changed (3) hide show
  1. app.py +177 -0
  2. cyber_attack.jpg +0 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from datasets import load_dataset
4
+ import pandas as pd
5
+ from huggingface_hub import login
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.naive_bayes import GaussianNB
8
+ from sklearn.tree import DecisionTreeClassifier
9
+ from sklearn.neighbors import KNeighborsClassifier
10
+ from sklearn.metrics import confusion_matrix, classification_report
11
+ from sklearn.preprocessing import LabelEncoder
12
+ import seaborn as sns
13
+ import matplotlib.pyplot as plt
14
+ import numpy as np
15
+ from PIL import Image
16
+
17
# Identifier of the dataset repository passed to load_dataset() further down.
dataset_name = "louiecerv/unsw-nb15-preprocessed"

# Retrieve Hugging Face token from environment variable
hf_token = os.getenv("HF_TOKEN")

# Abort this script run early when no token is configured (unset or empty),
# before the Hub login below is attempted.
if not hf_token:
    st.error("HF_TOKEN environment variable is not set. Please set it before running the app.")
    st.stop()

# Login to Hugging Face Hub
login(token=hf_token)

# Page heading and one-line description.
st.title("Cyber Attack Detection Dataset ML Analysis")
st.write("This app loads and analyzes the UNSW_NB15_training-set.csv dataset.")

# Display image
# The path is relative, so it is resolved against the process working directory.
image = Image.open("cyber_attack.jpg")
st.image(image, caption="Cybersecurity", use_container_width=True)
36
+
37
+ about = """
38
+ # About This App
39
+
40
+ This Streamlit app provides an interactive analysis of the UNSW-NB15 dataset, a popular benchmark for evaluating network intrusion detection systems. The app leverages machine learning techniques to classify network traffic as either normal or indicative of various attack types.
41
+
42
+ ## About the UNSW-NB15 Dataset
43
+
44
+ The UNSW-NB15 dataset was created by the Cyber Security Lab at the University of New South Wales, Canberra. It's a comprehensive dataset containing network traffic captures (tcpdump) and system call traces. The dataset includes a variety of modern attack types, making it a valuable resource for training and testing intrusion detection systems. Key features of the dataset include:
45
+
46
+ * **Diverse Attack Types:** Covers a wide range of attacks such as Fuzzers, Backdoor, DoS, Exploits, Generic, Reconnaissance, Shellcode, and Worms.
47
+ * **Realistic Network Traffic:** Generated using a realistic network environment, simulating real-world scenarios.
48
+ * **Labeled Data:** Each network flow is labeled with its corresponding attack type or as normal traffic, enabling supervised learning.
49
+
50
+ ## App Purpose
51
+
52
+ This app aims to:
53
+
54
+ 1. **Visualize and Explore the Data:** Provide an interface to view the dataset's structure, data types, and descriptive statistics. This allows users to understand the characteristics of the UNSW-NB15 dataset.
55
+
56
+ 2. **Train and Evaluate Machine Learning Models:** Implement and compare the performance of several machine learning classifiers, specifically:
57
+ * Naive Bayes
58
+ * Decision Tree
59
+ * K-Nearest Neighbors
60
+
61
+ 3. **Analyze Model Performance:** Present confusion matrices and classification reports to evaluate the effectiveness of each model in detecting different attack types. This helps users understand the strengths and weaknesses of each algorithm.
62
+
63
+ 4. **Facilitate Learning:** Serve as an educational tool for learning about network intrusion detection, machine learning classification, and dataset analysis.
64
+ """
65
# Collapsible panel that renders the long-form markdown stored in `about`.
with st.expander("About this App"):
    st.markdown(about)

# Load dataset
# NOTE(review): the nesting below was reconstructed from a listing whose
# indentation had been stripped -- confirm that st.success() belongs after the
# spinner block rather than inside it.
try:
    with st.spinner("Loading dataset..."):
        dataset = load_dataset(dataset_name)
    st.success("Dataset loaded successfully.")
except ValueError:
    st.error("Dataset not found or incorrect dataset name. Please check the dataset identifier.")
    st.stop()
except PermissionError:
    st.error("Authentication failed. Check if your Hugging Face token is correct.")
    st.stop()
except Exception as e:
    # Last-resort handler: surface the message in the UI and halt the script.
    st.error(f"Unexpected error: {e}")
    st.stop()

# Only the "train" split is used by the rest of the script.
df = dataset["train"].to_pandas()
st.write(f"### Train Split")
st.write(f"Shape: {df.shape}")
st.dataframe(df.head())

# Convert mixed-type columns to string or numeric
for col in df.columns:
    if df[col].dtype == 'object':  # Likely mixed types
        # NOTE(review): astype(str) also turns missing values into strings, so
        # the fillna(0) below will not affect these columns -- confirm intended.
        df[col] = df[col].astype(str)
    elif df[col].dtype in ['float64', 'int64']:
        # errors='coerce' turns anything unparseable into NaN.
        df[col] = pd.to_numeric(df[col], errors='coerce')  # Force numeric values

# Replace inf values and NaNs
df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinities with NaN
df.fillna(0, inplace=True)  # Replace NaNs with 0 or an appropriate default value

# Store the cleaned frame only if the session does not already hold one; an
# existing st.session_state.df is left untouched.
if "df" not in st.session_state:
    st.session_state.df = df
101
+
102
def _split_features_and_target(df):
    """Return ``(features, encoded_target, encoder)`` for the attack-category task."""
    # Both target columns are removed in a single call. 'label' is a second
    # target column; leaving it among the features would leak target
    # information into every model trained on this frame.
    features = df.drop(columns=['label', 'attack_cat'])
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(df['attack_cat'])
    return features, encoded_target, encoder


def run_model(model, model_name):
    """Train *model* on the session dataset and render its evaluation.

    Reads the frame stored in ``st.session_state.df``, predicts the
    ``attack_cat`` column from every column except ``label`` and
    ``attack_cat``, and writes a confusion-matrix heatmap and a
    classification-report table to the current Streamlit container.

    Args:
        model: Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Display name used in the spinner text and section header.

    Returns:
        None. All output goes to the Streamlit page.
    """
    with st.spinner(f"Training {model_name}..."):
        df = st.session_state.df
        st.header(f"{model_name} Classifier")

        X, y_encoded, le = _split_features_and_target(df)

        # Fixed seed keeps the 70/30 split identical across reruns and models.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_encoded, test_size=0.3, random_state=42
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Map the integer codes back to the original category names.
        y_pred_original = le.inverse_transform(y_pred)
        y_test_original = le.inverse_transform(y_test)

        # Show confusion matrix
        st.write("## Confusion Matrix")
        # Passing the full class list pins the row/column order so that it
        # matches the tick labels even if a class is absent from the test split.
        cm = confusion_matrix(y_test_original, y_pred_original, labels=le.classes_)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=le.classes_,
            yticklabels=le.classes_,
            ax=ax,
        )
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        st.pyplot(fig)
        # Release the figure; otherwise pyplot keeps every figure alive across reruns.
        plt.close(fig)

        # Show classification report
        st.write("## Classification Report")
        # zero_division=1 sets a metric to 1 where it would otherwise divide by zero.
        report = classification_report(
            y_test_original, y_pred_original, output_dict=True, zero_division=1
        )
        report_df = pd.DataFrame(report).transpose()
        st.table(report_df)
151
+
152
def main():
    """Render the dataset overview tab followed by one tab per classifier."""
    data = st.session_state.df

    # First tab is the dataset summary; the remaining tabs each host one model.
    overview_tab, *model_tabs = st.tabs(
        ["Dataset", "Naive Bayes", "Decision Tree", "K-Nearest Neighbor"]
    )

    with overview_tab:
        st.header("Dataset Information")
        st.write("## Data Types")
        st.write(data.dtypes)
        st.write("## Statistical Overview")
        st.write(data.describe())

    # Estimator class and display name, in the same order as the model tabs.
    classifiers = (
        (GaussianNB, "Naive Bayes"),
        (DecisionTreeClassifier, "Decision Tree"),
        (KNeighborsClassifier, "K-Nearest Neighbor"),
    )
    for tab, (estimator_cls, title) in zip(model_tabs, classifiers):
        with tab:
            run_model(estimator_cls(), title)


if __name__ == "__main__":
    main()
cyber_attack.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ datasets
2
+ pandas
3
+ streamlit
4
+ huggingface_hub
5
+ matplotlib
6
+ scikit-learn
7
+ seaborn
8
+ Pillow