srinuksv committed
Commit bdbc0c3 · verified · 1 Parent(s): ee5dc37

Upload link4 (2).py

Files changed (1): link4 (2).py (+387 -0)
link4 (2).py ADDED
# -*- coding: utf-8 -*-
"""link4.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1yTE900ZWoLy3vQwKE1Y-Qbm263XCIuN7
"""

# Install dependencies (Colab shell commands)
!pip install selenium webdriver-manager pyshark gradio requests scapy joblib
!apt-get update
!apt-get install -y tshark
!tshark --version

# Mount Google Drive so the CIC-IDS-2017 CSVs and the saved model are accessible
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
import joblib
import subprocess
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import pyshark
import numpy as np

file_paths = [
    '/content/drive/MyDrive/Colab Notebooks/link1/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Friday-WorkingHours-Morning.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Monday-WorkingHours.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Tuesday-WorkingHours.pcap_ISCX.csv',
    '/content/drive/MyDrive/Colab Notebooks/link1/Wednesday-workingHours.pcap_ISCX.csv'
]

# Combine all files into a single DataFrame
df = pd.concat([pd.read_csv(file) for file in file_paths], ignore_index=True)

# Strip any leading or trailing spaces from column names
df.columns = df.columns.str.strip()

# Print the column names to verify
print("Columns in DataFrame:")
print(df.columns)

# Check if 'Label' exists
if 'Label' not in df.columns:
    print("Error: 'Label' column not found in the dataset.")
else:
    # Proceed with mapping the labels to "benign" or "malicious"
    label_mapping = {
        'BENIGN': 'benign',
        'DDoS': 'malicious',
        'PortScan': 'malicious',
        'Bot': 'malicious',
        'Infiltration': 'malicious',
        'Web Attack': 'malicious',
        # Add other malicious classes here if necessary
    }

    # Map the labels and fill missing values with 'malicious'
    df['Label'] = df['Label'].map(label_mapping).fillna('malicious')

    # Convert categorical labels to numerical (benign -> 0, malicious -> 1)
    df['Label'] = df['Label'].astype('category').cat.codes

# Define features and target
all_features = df.columns.drop('Label')
features = df[all_features]
target = df['Label']

# Print first few rows of the processed DataFrame
print(df.head())
print(df.columns)
print(f"Features columns: {features.columns}")
print(f"Target unique values: {target.unique()}")

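# Optional sanity check (an illustrative sketch): show how many flows ended up in each class
# after the benign/malicious mapping, since a heavy imbalance affects how the accuracy figures
# printed further below should be read.
print("Class distribution (0 = benign, 1 = malicious):")
print(target.value_counts(normalize=True))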
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

print(f"Missing values in features:\n{features.isnull().sum()}")
print(f"Missing values in target:\n{target.isnull().sum()}")
print(f"Infinites in features:\n{np.isinf(features).sum()}")

# Replace infinite values with NaN (reassign rather than use inplace=True, since
# `features` is a slice of `df` and would otherwise raise a SettingWithCopyWarning)
features = features.replace([np.inf, -np.inf], np.nan)

# Handle missing values: impute with the mean (for numerical features)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Normalize features to handle large values (note: the tree ensemble below is scale-invariant,
# so the unscaled imputed matrix is what actually gets split and trained on)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features_imputed, target, test_size=0.3, random_state=42, stratify=target
)

# Initialize and train the Extra Trees model
model = ExtraTreesClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

from sklearn.metrics import accuracy_score, classification_report

train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

print("Classification Report (Test Data):")
print(classification_report(y_test, test_predictions))

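# Illustrative sketch: cross_val_score is imported above but never used, so this shows how a
# k-fold estimate could complement the single train/test split. Disabled by default because it
# is slow on the full CIC-IDS-2017 data; the fold count and scoring metric are assumptions.
RUN_CROSS_VALIDATION = False
if RUN_CROSS_VALIDATION:
    cv_scores = cross_val_score(
        ExtraTreesClassifier(n_estimators=100, random_state=42),
        features_imputed, target, cv=3, scoring='f1_macro', n_jobs=-1
    )
    print(f"3-fold macro-F1: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")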
# Save the model and feature names
joblib.dump(model, 'extratrees.pkl')
joblib.dump(all_features.tolist(), 'featurenames.pkl')

# Load the model and feature names
loaded_model = joblib.load('extratrees.pkl')
loaded_features = joblib.load('featurenames.pkl')

# Check if they are loaded successfully
print(f"Model Loaded: {loaded_model is not None}")
print(f"Features Loaded: {loaded_features is not None}")

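# Illustrative sketch (assumes the Extra Trees model above has just been fitted): impurity-based
# feature importances give a quick view of which CIC-IDS-2017 flow features drive the
# benign/malicious decision.
importances = pd.Series(model.feature_importances_, index=all_features)
print("Top 10 features by importance:")
print(importances.sort_values(ascending=False).head(10))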
# Display the classification report in a few different styles
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from IPython.display import display  # needed for rendering styled DataFrames


def plot_classification_report_styled(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)
    df_report = pd.DataFrame(report).transpose()

    # Style the DataFrame with different colors and formatting
    styled_report = df_report.style.background_gradient(cmap='viridis', axis=None) \
        .highlight_max(color='lightgreen', axis=0) \
        .highlight_min(color='lightcoral', axis=0) \
        .format('{:.2f}')

    # Display the styled report
    display(styled_report)


# Use the new function to display a styled classification report
plot_classification_report_styled(y_test, y_pred)


# Alternative styling using Seaborn and Matplotlib with customization
def plot_classification_report_seaborn_styled(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)
    df_report = pd.DataFrame(report).transpose()
    plt.figure(figsize=(10, 6))
    sns.heatmap(df_report[['precision', 'recall', 'f1-score']], annot=True, fmt=".2f",
                cmap="YlGnBu", linewidths=.5, annot_kws={"size": 12})
    plt.title("Classification Report Heatmap", fontsize=16)
    plt.xlabel("Metrics", fontsize=14)
    plt.ylabel("Classes", fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()


plot_classification_report_seaborn_styled(y_test, y_pred)

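# Illustrative sketch using the same test split: a confusion matrix complements the report by
# showing the raw benign/malicious misclassification counts.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['benign', 'malicious'], yticklabels=['benign', 'malicious'])
plt.title('Confusion Matrix (Test Data)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()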
import time
import subprocess
import pyshark
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import numpy as np
import joblib
import pandas as pd
import scapy.all as scapy
import requests
import gradio as gr

# Load the pre-trained model and feature names
model = joblib.load('/content/drive/MyDrive/Colab Notebooks/link1/extratrees.pkl')
all_features = joblib.load('/content/drive/MyDrive/Colab Notebooks/link1/featurenames.pkl')

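# Illustrative consistency check (a sketch; assumes the pickled model was trained with a
# reasonably recent scikit-learn, so that n_features_in_ is available):
print(f"Model expects {model.n_features_in_} features; loaded {len(all_features)} feature names")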
# Capture packets with tshark while a headless browser visits the URL
# (the capture window can be lengthened via capture_duration)
def capture_packets(url, capture_duration=30, capture_file="capture.pcap"):
    try:
        # Start tshark to capture packets
        tshark_process = subprocess.Popen(
            ["tshark", "-i", "any", "-f", "tcp port 80 or tcp port 443 or port 53", "-w", capture_file],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        # Wait for tshark to start
        time.sleep(2)

        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run Chrome in headless mode
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Use Selenium to visit the URL
        service = Service(ChromeDriverManager().install())  # Ensure the driver is installed
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.get(url)

        # Capture packets for the specified duration
        time.sleep(capture_duration)

        # Close the browser
        driver.quit()

        # Stop tshark
        tshark_process.terminate()
        tshark_process.wait()

        # Read captured packets using pyshark for detailed packet information
        packets = []
        cap = pyshark.FileCapture(capture_file)
        for packet in cap:
            packets.append(str(packet))
        cap.close()
        return packets
    except Exception as e:
        print(f"Error in capturing packets: {e}")
        return None

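# Illustrative usage (a sketch; "https://example.com" is only a placeholder, and tshark on the
# "any" interface needs root privileges, which a default Colab runtime provides):
DEMO_CAPTURE = False  # set True to try a quick standalone capture
if DEMO_CAPTURE:
    sample_packets = capture_packets("https://example.com", capture_duration=10)
    print(f"Captured {len(sample_packets) if sample_packets else 0} packets")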
# Function to extract features from captured packets
def extract_features(capture_file):
    try:
        cap = pyshark.FileCapture(capture_file)

        # Initialize features
        features = {feature: 0 for feature in all_features}
        total_packets = 0
        total_bytes = 0
        start_time = None
        end_time = None
        packet_lengths = []
        protocol_counts = {'TCP': 0, 'UDP': 0, 'ICMP': 0}
        tcp_flags = {'SYN': 0, 'ACK': 0, 'FIN': 0, 'RST': 0}

        for packet in cap:
            total_packets += 1
            total_bytes += int(packet.length)
            packet_lengths.append(int(packet.length))
            timestamp = float(packet.sniff_time.timestamp())

            if start_time is None:
                start_time = timestamp
            end_time = timestamp

            # Counting protocols and flags
            # Note: depending on the tshark/pyshark version, packet.tcp.flags may be a hex string,
            # in which case per-flag fields (e.g. packet.tcp.flags_syn) are more reliable.
            if hasattr(packet, 'tcp'):
                protocol_counts['TCP'] += 1
                if 'SYN' in packet.tcp.flags:
                    tcp_flags['SYN'] += 1
                if 'ACK' in packet.tcp.flags:
                    tcp_flags['ACK'] += 1
                if 'FIN' in packet.tcp.flags:
                    tcp_flags['FIN'] += 1
                if 'RST' in packet.tcp.flags:
                    tcp_flags['RST'] += 1
            elif hasattr(packet, 'udp'):
                protocol_counts['UDP'] += 1
            elif hasattr(packet, 'icmp'):
                protocol_counts['ICMP'] += 1

        cap.close()

        duration = end_time - start_time if start_time and end_time else 0

        # Populate extracted features (keys not present in the training feature list are
        # dropped later by the reindex in compare_with_dataset)
        features.update({
            "Flow Duration": duration,
            "Total Packets": total_packets,
            "Total Bytes": total_bytes,
            "Fwd Packet Length Mean": np.mean(packet_lengths) if packet_lengths else 0,
            "Bwd Packet Length Mean": 0,  # Assuming no fwd/bwd distinction here
            "Flow Bytes/s": total_bytes / duration if duration else 0,
            "Flow Packets/s": total_packets / duration if duration else 0,
            "Average Packet Size": np.mean(packet_lengths) if packet_lengths else 0,
            "Min Packet Size": min(packet_lengths) if packet_lengths else 0,
            "Max Packet Size": max(packet_lengths) if packet_lengths else 0,
            "Packet Length Variance": np.var(packet_lengths) if len(packet_lengths) > 1 else 0,
            "TCP Packets": protocol_counts['TCP'],
            "UDP Packets": protocol_counts['UDP'],
            "ICMP Packets": protocol_counts['ICMP'],
            "TCP SYN Flags": tcp_flags['SYN'],
            "TCP ACK Flags": tcp_flags['ACK'],
            "TCP FIN Flags": tcp_flags['FIN'],
            "TCP RST Flags": tcp_flags['RST']
        })

        return features
    except Exception as e:
        print(f"Error in extracting features: {e}")
        return None

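# Illustrative usage (a sketch assuming a "capture.pcap" file already exists in the working
# directory, e.g. from a previous capture_packets run):
DEMO_EXTRACT = False  # set True to inspect the nonzero features of an existing pcap
if DEMO_EXTRACT:
    demo_features = extract_features("capture.pcap")
    if demo_features:
        nonzero = {name: value for name, value in demo_features.items() if value}
        print(f"Extracted {len(nonzero)} nonzero features:")
        print(nonzero)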
# Classify extracted features with the model trained on the CIC-IDS-2017 dataset
def compare_with_dataset(packet_features):
    # Convert the extracted features into the column order the model was trained on
    packet_features_series = pd.Series(packet_features)
    packet_features_series = packet_features_series.reindex(all_features, fill_value=0)
    # Predict using the loaded model (reshape to a single-row 2-D array)
    prediction = model.predict(packet_features_series.values.reshape(1, -1))[0]
    return "benign" if prediction == 0 else "malicious"

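# Illustrative usage (a sketch with a hypothetical all-zero feature vector, just to show that
# the return value is the string "benign" or "malicious"):
DEMO_COMPARE = False
if DEMO_COMPARE:
    print(compare_with_dataset({feature: 0 for feature in all_features}))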
# Analyze the URL and predict whether it is malicious
def analyze_url(url):
    try:
        # Generate traffic to the URL and capture packets with Scapy
        response = requests.get(url)
        packets = scapy.sniff(count=100, timeout=30)  # stop after 100 packets or 30 seconds
        capture_file = 'capture.pcap'
        scapy.wrpcap(capture_file, packets)

        # Extract features from the captured packets
        packet_features = extract_features(capture_file)
        if packet_features is not None:
            prediction = compare_with_dataset(packet_features)

            # Use Pyshark to capture HTTP/HTTPS/DNS packet details
            http_dns_packets = capture_packets(url)

            captured_packets = [str(packet) for packet in packets]
            return prediction, {"scapy_packets": captured_packets, "http_dns_packets": http_dns_packets}
        else:
            return "Error in feature extraction", []
    except Exception as e:
        return str(e), []

# Define the Gradio interface
iface = gr.Interface(
    fn=analyze_url,
    inputs=gr.Textbox(label="Enter URL"),
    outputs=[gr.Textbox(label="Prediction"), gr.JSON(label="Captured Packets")],
    title="URL Malicious Activity Detection",
    description="Enter a URL to predict whether it is malicious or benign by analyzing its network traffic."
)

# Launch the interface
iface.launch(debug=True)

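# Illustrative sketch: the same pipeline can be exercised without the UI, which is handy for a
# quick check. "https://example.com" is only a placeholder; expect the call to take roughly the
# capture window plus page-load time.
DEMO_ANALYZE = False  # set True to run one prediction outside Gradio
if DEMO_ANALYZE:
    verdict, traffic = analyze_url("https://example.com")
    print(f"Prediction: {verdict}")
    if isinstance(traffic, dict):
        print(f"Scapy packets captured: {len(traffic.get('scapy_packets', []))}")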
import matplotlib.pyplot as plt
import numpy as np

# Sample data standing in for values extracted from captured packets
# (in practice these would come from the extracted packet features)
tcp_counts = 20   # Number of TCP packets
udp_counts = 10   # Number of UDP packets
packet_sizes = [60, 150, 300, 450, 500, 700, 900, 1100, 1400, 1600]  # Example packet sizes in bytes
timestamps = np.linspace(0, 30, len(packet_sizes))  # Sample timestamps over 30 seconds

# Create a new figure
plt.figure(figsize=(10, 6))

# Plot TCP and UDP packet counts in a bar chart
plt.subplot(2, 1, 1)  # 2 rows, 1 column, first plot
plt.bar(['TCP', 'UDP'], [tcp_counts, udp_counts], color=['blue', 'orange'])
plt.title('TCP vs UDP Packet Counts')
plt.xlabel('Protocol')
plt.ylabel('Packet Count')

# Plot packet sizes over time
plt.subplot(2, 1, 2)  # 2 rows, 1 column, second plot
plt.plot(timestamps, packet_sizes, marker='o', color='green')
plt.title('Packet Sizes over Time')
plt.xlabel('Time (s)')
plt.ylabel('Packet Size (bytes)')

# Adjust layout to prevent overlap
plt.tight_layout()

# Display the plots
plt.show()

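# Illustrative sketch (assumes a `demo_features` dict from an earlier extract_features run):
# the same bar chart can be driven by real protocol counts instead of the hard-coded samples.
if 'demo_features' in dir() and isinstance(demo_features, dict):
    plt.figure(figsize=(6, 4))
    plt.bar(['TCP', 'UDP', 'ICMP'],
            [demo_features.get('TCP Packets', 0),
             demo_features.get('UDP Packets', 0),
             demo_features.get('ICMP Packets', 0)],
            color=['blue', 'orange', 'red'])
    plt.title('Protocol Counts from a Real Capture')
    plt.ylabel('Packet Count')
    plt.show()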