srinuksv committed
Commit bdbc0c3 · verified · 1 Parent(s): ee5dc37

Upload link4 (2).py

Files changed (1): link4 (2).py (+387 -0)
link4 (2).py ADDED
# -*- coding: utf-8 -*-
"""link4.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1yTE900ZWoLy3vQwKE1Y-Qbm263XCIuN7
"""

# Install dependencies (Colab shell commands)
!pip install selenium webdriver-manager pyshark gradio requests scapy joblib
!apt-get update
!apt-get install -y tshark
!tshark --version

# Mount Google Drive so the CIC-IDS-2017 CSVs and the saved model are accessible
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
import joblib
import subprocess
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import pyshark
import numpy as np

file_paths = [
    '/content/drive/MyDrive/Colab Notebooks/link1/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Friday-WorkingHours-Morning.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Monday-WorkingHours.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Tuesday-WorkingHours.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Wednesday-workingHours.pcap_ISCX.csv'
]

# Combine all files into a single DataFrame
df = pd.concat([pd.read_csv(file) for file in file_paths], ignore_index=True)

# Strip any leading or trailing spaces from column names
df.columns = df.columns.str.strip()

# Print the column names to verify
print("Columns in DataFrame:")
print(df.columns)

# Check if 'Label' exists
if 'Label' not in df.columns:
    print("Error: 'Label' column not found in the dataset.")
else:
    # Proceed with mapping the labels to "benign" or "malicious"
    label_mapping = {
        'BENIGN': 'benign',
        'DDoS': 'malicious',
        'PortScan': 'malicious',
        'Bot': 'malicious',
        'Infiltration': 'malicious',
        'Web Attack': 'malicious',
        # Add other malicious classes here if necessary
    }

    # Map the labels and fill missing values with 'malicious'
    df['Label'] = df['Label'].map(label_mapping).fillna('malicious')

    # Convert categorical labels to numerical (benign -> 0, malicious -> 1)
    df['Label'] = df['Label'].astype('category').cat.codes

# Define features and target
all_features = df.columns.drop('Label')
features = df[all_features]
target = df['Label']

# Print first few rows of the processed DataFrame
print(df.head())
print(df.columns)
print(f"Features columns: {features.columns}")
print(f"Target unique values: {target.unique()}")

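# Optional sanity check (an illustrative sketch): show how many flows ended up in each class
# after the benign/malicious mapping, since a heavy imbalance affects how the accuracy figures
# printed further below should be read.
print("Class distribution (0 = benign, 1 = malicious):")
print(target.value_counts(normalize=True))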
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

print(f"Missing values in features:\n{features.isnull().sum()}")
print(f"Missing values in target:\n{target.isnull().sum()}")
print(f"Infinites in features:\n{np.isinf(features).sum()}")

# Replace infinite values with NaN (reassign rather than use inplace=True, since
# `features` is a slice of `df` and would otherwise raise a SettingWithCopyWarning)
features = features.replace([np.inf, -np.inf], np.nan)

# Handle missing values: impute with the mean (for numerical features)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Normalize features to handle large values (note: the tree ensemble below is scale-invariant,
# so the unscaled imputed matrix is what actually gets split and trained on)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features_imputed, target, test_size=0.3, random_state=42, stratify=target
)

# Initialize and train the Extra Trees model
model = ExtraTreesClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

from sklearn.metrics import accuracy_score, classification_report

train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("Classification Report (Test Data):")
print(classification_report(y_test, test_predictions))

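# Illustrative sketch: cross_val_score is imported above but never used, so this shows how a
# k-fold estimate could complement the single train/test split. Disabled by default because it
# is slow on the full CIC-IDS-2017 data; the fold count and scoring metric are assumptions.
RUN_CROSS_VALIDATION = False
if RUN_CROSS_VALIDATION:
    cv_scores = cross_val_score(
        ExtraTreesClassifier(n_estimators=100, random_state=42),
        features_imputed, target, cv=3, scoring='f1_macro', n_jobs=-1
    )
    print(f"3-fold macro-F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")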
# Save the model and feature names
joblib.dump(model, 'extratrees.pkl')
joblib.dump(all_features.tolist(), 'featurenames.pkl')

# Load the model and feature names
loaded_model = joblib.load('extratrees.pkl')
loaded_features = joblib.load('featurenames.pkl')

# Check if they are loaded successfully
print(f"Model Loaded: {loaded_model is not None}")
print(f"Features Loaded: {loaded_features is not None}")

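# Illustrative sketch (assumes the Extra Trees model above has just been fitted): impurity-based
# feature importances give a quick view of which CIC-IDS-2017 flow features drive the
# benign/malicious decision.
importances = pd.Series(model.feature_importances_, index=all_features)
print("Top 10 features by importance:")
print(importances.sort_values(ascending=False).head(10))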
# Display the classification report in a few different styles
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from IPython.display import display  # needed for rendering styled DataFrames


def plot_classification_report_styled(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)
    df_report = pd.DataFrame(report).transpose()

    # Style the DataFrame with different colors and formatting
    styled_report = df_report.style.background_gradient(cmap='viridis', axis=None) \
        .highlight_max(color='lightgreen', axis=0) \
        .highlight_min(color='lightcoral', axis=0) \
        .format('{:.2f}')

    # Display the styled report
    display(styled_report)


# Use the new function to display a styled classification report
plot_classification_report_styled(y_test, y_pred)


# Alternative styling using Seaborn and Matplotlib with customization
def plot_classification_report_seaborn_styled(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)
    df_report = pd.DataFrame(report).transpose()
    plt.figure(figsize=(10, 6))
    sns.heatmap(df_report[['precision', 'recall', 'f1-score']], annot=True, fmt=".2f",
                cmap="YlGnBu", linewidths=.5, annot_kws={"size": 12})
    plt.title("Classification Report Heatmap", fontsize=16)
    plt.xlabel("Metrics", fontsize=14)
    plt.ylabel("Classes", fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()


plot_classification_report_seaborn_styled(y_test, y_pred)

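# Illustrative sketch using the same test split: a confusion matrix complements the report by
# showing the raw benign/malicious misclassification counts.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['benign', 'malicious'], yticklabels=['benign', 'malicious'])
plt.title('Confusion Matrix (Test Data)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()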
import time
import subprocess
import pyshark
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import numpy as np
import joblib
import pandas as pd
import scapy.all as scapy
import requests
import gradio as gr

# Load the pre-trained model and feature names
model = joblib.load('/content/drive/MyDrive/Colab Notebooks/link1/extratrees.pkl')
all_features = joblib.load('/content/drive/MyDrive/Colab Notebooks/link1/featurenames.pkl')

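# Illustrative consistency check (a sketch; assumes the pickled model was trained with a
# reasonably recent scikit-learn, so that n_features_in_ is available):
print(f"Model expects {model.n_features_in_} features; loaded {len(all_features)} feature names")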
# Capture packets with tshark while a headless browser visits the URL
# (the capture window can be lengthened via capture_duration)
def capture_packets(url, capture_duration=30, capture_file="capture.pcap"):
    try:
        # Start tshark to capture packets
        tshark_process = subprocess.Popen(
            ["tshark", "-i", "any", "-f", "tcp port 80 or tcp port 443 or port 53", "-w", capture_file],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        # Wait for tshark to start
        time.sleep(2)

        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run Chrome in headless mode
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Use Selenium to visit the URL
        service = Service(ChromeDriverManager().install())  # Ensure the driver is installed
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.get(url)

        # Capture packets for the specified duration
        time.sleep(capture_duration)

        # Close the browser
        driver.quit()

        # Stop tshark
        tshark_process.terminate()
        tshark_process.wait()

        # Read captured packets using pyshark for detailed packet information
        packets = []
        cap = pyshark.FileCapture(capture_file)
        for packet in cap:
            packets.append(str(packet))
        cap.close()
        return packets
    except Exception as e:
        print(f"Error in capturing packets: {e}")
        return None

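# Illustrative usage (a sketch; "https://example.com" is only a placeholder, and tshark on the
# "any" interface needs root privileges, which a default Colab runtime provides):
DEMO_CAPTURE = False  # set True to try a quick standalone capture
if DEMO_CAPTURE:
    sample_packets = capture_packets("https://example.com", capture_duration=10)
    print(f"Captured {len(sample_packets) if sample_packets else 0} packets")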
# Function to extract features from captured packets
def extract_features(capture_file):
    try:
        cap = pyshark.FileCapture(capture_file)

        # Initialize features
        features = {feature: 0 for feature in all_features}
        total_packets = 0
        total_bytes = 0
        start_time = None
        end_time = None
        packet_lengths = []
        protocol_counts = {'TCP': 0, 'UDP': 0, 'ICMP': 0}
        tcp_flags = {'SYN': 0, 'ACK': 0, 'FIN': 0, 'RST': 0}

        for packet in cap:
            total_packets += 1
            total_bytes += int(packet.length)
            packet_lengths.append(int(packet.length))
            timestamp = float(packet.sniff_time.timestamp())

            if start_time is None:
                start_time = timestamp
            end_time = timestamp

            # Counting protocols and flags
            # Note: depending on the tshark/pyshark version, packet.tcp.flags may be a hex string,
            # in which case per-flag fields (e.g. packet.tcp.flags_syn) are more reliable.
            if hasattr(packet, 'tcp'):
                protocol_counts['TCP'] += 1
                if 'SYN' in packet.tcp.flags:
                    tcp_flags['SYN'] += 1
                if 'ACK' in packet.tcp.flags:
                    tcp_flags['ACK'] += 1
                if 'FIN' in packet.tcp.flags:
                    tcp_flags['FIN'] += 1
                if 'RST' in packet.tcp.flags:
                    tcp_flags['RST'] += 1
            elif hasattr(packet, 'udp'):
                protocol_counts['UDP'] += 1
            elif hasattr(packet, 'icmp'):
                protocol_counts['ICMP'] += 1

        cap.close()

        duration = end_time - start_time if start_time and end_time else 0

        # Populate extracted features (keys not present in the training feature list are
        # dropped later by the reindex in compare_with_dataset)
        features.update({
            "Flow Duration": duration,
            "Total Packets": total_packets,
            "Total Bytes": total_bytes,
            "Fwd Packet Length Mean": np.mean(packet_lengths) if packet_lengths else 0,
            "Bwd Packet Length Mean": 0,  # Assuming no fwd/bwd distinction here
            "Flow Bytes/s": total_bytes / duration if duration else 0,
            "Flow Packets/s": total_packets / duration if duration else 0,
            "Average Packet Size": np.mean(packet_lengths) if packet_lengths else 0,
            "Min Packet Size": min(packet_lengths) if packet_lengths else 0,
            "Max Packet Size": max(packet_lengths) if packet_lengths else 0,
            "Packet Length Variance": np.var(packet_lengths) if len(packet_lengths) > 1 else 0,
            "TCP Packets": protocol_counts['TCP'],
            "UDP Packets": protocol_counts['UDP'],
            "ICMP Packets": protocol_counts['ICMP'],
            "TCP SYN Flags": tcp_flags['SYN'],
            "TCP ACK Flags": tcp_flags['ACK'],
            "TCP FIN Flags": tcp_flags['FIN'],
            "TCP RST Flags": tcp_flags['RST']
        })

        return features
    except Exception as e:
        print(f"Error in extracting features: {e}")
        return None

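# Illustrative usage (a sketch assuming a "capture.pcap" file already exists in the working
# directory, e.g. from a previous capture_packets run):
DEMO_EXTRACT = False  # set True to inspect the nonzero features of an existing pcap
if DEMO_EXTRACT:
    demo_features = extract_features("capture.pcap")
    if demo_features:
        nonzero = {name: value for name, value in demo_features.items() if value}
        print(f"Extracted {len(nonzero)} nonzero features:")
        print(nonzero)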
# Classify extracted features with the model trained on the CIC-IDS-2017 dataset
def compare_with_dataset(packet_features):
    # Convert the extracted features into the column order the model was trained on
    packet_features_series = pd.Series(packet_features)
    packet_features_series = packet_features_series.reindex(all_features, fill_value=0)
    # Predict using the loaded model (reshape to a single-row 2-D array)
    prediction = model.predict(packet_features_series.values.reshape(1, -1))[0]
    return "benign" if prediction == 0 else "malicious"

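# Illustrative usage (a sketch with a hypothetical all-zero feature vector, just to show that
# the return value is the string "benign" or "malicious"):
DEMO_COMPARE = False
if DEMO_COMPARE:
    print(compare_with_dataset({feature: 0 for feature in all_features}))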
# Analyze the URL and predict whether it is malicious
def analyze_url(url):
    try:
        # Generate traffic to the URL and capture packets with Scapy
        response = requests.get(url)
        packets = scapy.sniff(count=100, timeout=30)  # stop after 100 packets or 30 seconds
        capture_file = 'capture.pcap'
        scapy.wrpcap(capture_file, packets)

        # Extract features from the captured packets
        packet_features = extract_features(capture_file)
        if packet_features is not None:
            prediction = compare_with_dataset(packet_features)

            # Use Pyshark to capture HTTP/HTTPS/DNS packet details
            http_dns_packets = capture_packets(url)

            captured_packets = [str(packet) for packet in packets]
            return prediction, {"scapy_packets": captured_packets, "http_dns_packets": http_dns_packets}
        else:
            return "Error in feature extraction", []
    except Exception as e:
        return str(e), []

# Define the Gradio interface
iface = gr.Interface(
    fn=analyze_url,
    inputs=gr.Textbox(label="Enter URL"),
    outputs=[gr.Textbox(label="Prediction"), gr.JSON(label="Captured Packets")],
    title="URL Malicious Activity Detection",
    description="Enter a URL to predict whether it is malicious or benign by analyzing its network traffic."
)

# Launch the interface
iface.launch(debug=True)

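# Illustrative sketch: the same pipeline can be exercised without the UI, which is handy for a
# quick check. "https://example.com" is only a placeholder; expect the call to take roughly the
# capture window plus page-load time.
DEMO_ANALYZE = False  # set True to run one prediction outside Gradio
if DEMO_ANALYZE:
    verdict, traffic = analyze_url("https://example.com")
    print(f"Prediction: {verdict}")
    if isinstance(traffic, dict):
        print(f"Scapy packets captured: {len(traffic.get('scapy_packets', []))}")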
import matplotlib.pyplot as plt
import numpy as np

# Sample data standing in for values extracted from captured packets
# (in practice these would come from the extracted packet features)
tcp_counts = 20   # Number of TCP packets
udp_counts = 10   # Number of UDP packets
packet_sizes = [60, 150, 300, 450, 500, 700, 900, 1100, 1400, 1600]  # Example packet sizes in bytes
timestamps = np.linspace(0, 30, len(packet_sizes))  # Sample timestamps over 30 seconds

# Create a new figure
plt.figure(figsize=(10, 6))

# Plot TCP and UDP packet counts in a bar chart
plt.subplot(2, 1, 1)  # 2 rows, 1 column, first plot
plt.bar(['TCP', 'UDP'], [tcp_counts, udp_counts], color=['blue', 'orange'])
plt.title('TCP vs UDP Packet Counts')
plt.xlabel('Protocol')
plt.ylabel('Packet Count')

# Plot packet sizes over time
plt.subplot(2, 1, 2)  # 2 rows, 1 column, second plot
plt.plot(timestamps, packet_sizes, marker='o', color='green')
plt.title('Packet Sizes over Time')
plt.xlabel('Time (s)')
plt.ylabel('Packet Size (bytes)')

# Adjust layout to prevent overlap
plt.tight_layout()

# Display the plots
plt.show()

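# Illustrative sketch (assumes a `demo_features` dict from an earlier extract_features run):
# the same bar chart can be driven by real protocol counts instead of the hard-coded samples.
if 'demo_features' in dir() and isinstance(demo_features, dict):
    plt.figure(figsize=(6, 4))
    plt.bar(['TCP', 'UDP', 'ICMP'],
            [demo_features.get('TCP Packets', 0),
             demo_features.get('UDP Packets', 0),
             demo_features.get('ICMP Packets', 0)],
            color=['blue', 'orange', 'red'])
    plt.title('Protocol Counts from a Real Capture')
    plt.ylabel('Packet Count')
    plt.show()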