import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
import json
import os
import re
import urllib.parse
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tqdm
# --- Healthcare URL Detection Components ---
# Healthcare-related keywords for domain detection
HEALTHCARE_KEYWORDS = [
'health', 'medical', 'hospital', 'clinic', 'pharma', 'patient', 'care', 'med',
'doctor', 'physician', 'nurse', 'therapy', 'rehab', 'dental', 'cardio', 'neuro',
'oncology', 'pediatric', 'orthopedic', 'surgery', 'diagnostic', 'wellbeing',
'wellness', 'ehr', 'emr', 'mychart', 'medicare', 'medicaid', 'insurance'
]
# Common healthcare institutions and systems
HEALTHCARE_INSTITUTIONS = [
'mayo', 'cleveland', 'hopkins', 'kaiser', 'mount sinai', 'cedars', 'baylor',
'nhs', 'quest', 'labcorp', 'cvs', 'walgreens', 'aetna', 'cigna', 'unitedhealthcare',
'bluecross', 'anthem', 'humana', 'va.gov', 'cdc', 'who', 'nih'
]
# Healthcare TLDs and specific domains
HEALTHCARE_DOMAINS = ['.health', '.healthcare', '.medicine', '.hospital', '.clinic', 'mychart.']
# --- Feature Extraction Functions ---
def url_length(url):
"""Return the length of the URL."""
return len(url)
def num_dots(url):
"""Return the number of dots in the URL."""
return url.count('.')
def num_hyphens(url):
"""Return the number of hyphens in the URL."""
return url.count('-')
def num_at(url):
"""Return the number of @ symbols in the URL."""
return url.count('@')
def num_tilde(url):
"""Return the number of ~ symbols in the URL."""
return url.count('~')
def num_underscore(url):
"""Return the number of underscores in the URL."""
return url.count('_')
def num_percent(url):
"""Return the number of percent symbols in the URL."""
return url.count('%')
def num_ampersand(url):
"""Return the number of ampersands in the URL."""
return url.count('&')
def num_hash(url):
"""Return the number of hash symbols in the URL."""
return url.count('#')
def has_https(url):
"""Return 1 if the URL uses HTTPS, 0 otherwise."""
return int(url.startswith('https://'))
def has_ip_address(url):
"""Check if the URL contains an IP address instead of a domain name."""
try:
parsed_url = urllib.parse.urlparse(url)
if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', parsed_url.netloc):
return 1
# Check for IPv6
if re.match(r'^\[[0-9a-fA-F:]+\]$', parsed_url.netloc):
return 1
return 0
except:
return 0
def get_hostname_length(url):
"""Return the length of the hostname."""
try:
parsed_url = urllib.parse.urlparse(url)
return len(parsed_url.netloc)
except:
return 0
def get_path_length(url):
"""Return the length of the path."""
try:
parsed_url = urllib.parse.urlparse(url)
return len(parsed_url.path)
except:
return 0
def get_path_level(url):
"""Return the number of directories in the path."""
try:
parsed_url = urllib.parse.urlparse(url)
return parsed_url.path.count('/')
except:
return 0
def get_subdomain_level(url):
"""Return the number of subdomains in the URL."""
try:
parsed_url = urllib.parse.urlparse(url)
hostname = parsed_url.netloc
if has_ip_address(url):
return 0 # IP addresses don't have subdomains
parts = hostname.split('.')
# Remove top-level and second-level domains
if len(parts) > 2:
return len(parts) - 2 # Count remaining parts as subdomain levels
else:
return 0 # No subdomains
except:
return 0
def has_double_slash_in_path(url):
"""Check if the path contains a double slash."""
try:
parsed_url = urllib.parse.urlparse(url)
return int('//' in parsed_url.path)
except:
return 0
def get_tld(url):
"""Extract the top-level domain from a URL."""
try:
parsed_url = urllib.parse.urlparse(url)
hostname = parsed_url.netloc.lower()
parts = hostname.split('.')
if len(parts) > 1:
return parts[-1]
return ''
except:
return ''
def count_digits(url):
"""Count the number of digits in the URL."""
return sum(c.isdigit() for c in url)
def digit_ratio(url):
"""Calculate the ratio of digits to the total URL length."""
if len(url) == 0:
return 0
return count_digits(url) / len(url)
def count_letters(url):
"""Count the number of letters in the URL."""
return sum(c.isalpha() for c in url)
def letter_ratio(url):
"""Calculate the ratio of letters to the total URL length."""
if len(url) == 0:
return 0
return count_letters(url) / len(url)
def count_special_chars(url):
"""Count the number of special characters in the URL."""
return sum(not c.isalnum() and not c.isspace() for c in url)
def special_char_ratio(url):
"""Calculate the ratio of special characters to the total URL length."""
if len(url) == 0:
return 0
return count_special_chars(url) / len(url)
def get_query_length(url):
"""Return the length of the query string."""
try:
parsed_url = urllib.parse.urlparse(url)
return len(parsed_url.query)
except:
return 0
def get_fragment_length(url):
"""Return the length of the fragment."""
try:
parsed_url = urllib.parse.urlparse(url)
return len(parsed_url.fragment)
except:
return 0
def healthcare_relevance_score(url):
"""
Calculate a relevance score for healthcare-related URLs.
Higher scores indicate stronger relation to healthcare.
"""
url_lower = url.lower()
parsed_url = urllib.parse.urlparse(url_lower)
domain = parsed_url.netloc
path = parsed_url.path
score = 0
# Check for healthcare keywords in domain
for keyword in HEALTHCARE_KEYWORDS:
if keyword in domain:
score += 3
elif keyword in path:
score += 1
# Check for healthcare institutions
for institution in HEALTHCARE_INSTITUTIONS:
if institution in domain:
score += 4
elif institution in path:
score += 2
# Check for healthcare-specific domains and TLDs
for healthcare_domain in HEALTHCARE_DOMAINS:
if healthcare_domain in domain:
score += 3
# Check for EHR/patient portal indicators
if 'portal' in domain or 'portal' in path:
score += 2
if 'patient' in domain or 'mychart' in domain:
score += 3
if 'ehr' in domain or 'emr' in domain:
score += 3
# Normalize score to be between 0 and 1
return min(score / 10.0, 1.0)
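# Illustrative sketch (not called by the pipeline; the URLs below are made up):
# a hospital patient portal should saturate the heuristic at 1.0, while a generic
# domain with no healthcare cues scores 0.0.
def _demo_healthcare_relevance():
    high = healthcare_relevance_score("https://mychart.examplehospital.org/patient/login")
    low = healthcare_relevance_score("https://example.com/index.html")
    return high, low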
def extract_features(url):
"""Extract all features from a given URL."""
features = [
        # Core URL structure features (16)
num_dots(url),
get_subdomain_level(url),
get_path_level(url),
url_length(url),
num_hyphens(url),
num_at(url),
num_tilde(url),
num_underscore(url),
num_percent(url),
num_ampersand(url),
num_hash(url),
has_https(url),
has_ip_address(url),
get_hostname_length(url),
get_path_length(url),
has_double_slash_in_path(url),
# Additional features
digit_ratio(url),
letter_ratio(url),
special_char_ratio(url),
get_query_length(url),
get_fragment_length(url),
healthcare_relevance_score(url)
]
return features
def get_feature_names():
"""Get names of all features in the order they are extracted."""
return [
'num_dots', 'subdomain_level', 'path_level', 'url_length',
'num_hyphens', 'num_at', 'num_tilde', 'num_underscore',
'num_percent', 'num_ampersand', 'num_hash', 'has_https',
'has_ip_address', 'hostname_length', 'path_length',
'double_slash_in_path', 'digit_ratio', 'letter_ratio',
'special_char_ratio', 'query_length', 'fragment_length',
'healthcare_relevance'
]
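# Minimal usage sketch (the URL is illustrative, not from any dataset): each URL is
# mapped to a fixed-length numeric vector whose order matches get_feature_names().
def _demo_feature_vector():
    example_url = "https://portal.examplehealth.org/patient/records?id=123#top"
    feats = extract_features(example_url)
    assert len(feats) == len(get_feature_names())  # 22 values, 22 names
    return dict(zip(get_feature_names(), feats))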
# --- Dataset Loading and Processing ---
class URLDataset(Dataset):
def __init__(self, features, labels):
"""
Custom PyTorch Dataset for URL features and labels.
Args:
features (numpy.ndarray): Feature vectors for each URL
labels (numpy.ndarray): Labels for each URL (0 for benign, 1 for malicious)
"""
self.features = torch.tensor(features, dtype=torch.float32)
self.labels = torch.tensor(labels, dtype=torch.long)
def __len__(self):
return len(self.labels)
def __getitem__(self, idx):
return self.features[idx], self.labels[idx]
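# Sketch of how the dataset is consumed (synthetic arrays, purely illustrative):
# each batch is a (features, labels) pair shaped (batch, n_features) and (batch,).
def _demo_url_dataset():
    X = np.random.rand(8, len(get_feature_names())).astype(np.float32)
    y = np.random.randint(0, 2, size=8)
    loader = DataLoader(URLDataset(X, y), batch_size=4, shuffle=False)
    return next(iter(loader))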
def load_huggingface_data(file_path):
"""
Load the Hugging Face dataset from a JSON file.
Args:
file_path: Path to the JSON file
Returns:
List of tuples containing (url, label)
"""
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
url_data = []
for item in data:
url = item.get('text', '')
label = item.get('label', -1)
if url and label != -1: # Only add entries with valid URLs and labels
url_data.append((url, label))
print(f"Loaded {len(url_data)} URLs from Hugging Face dataset")
return url_data
def load_phiusiil_data(file_path):
"""
Load the PhiUSIIL dataset from a CSV file.
Args:
file_path: Path to the CSV file
Returns:
List of tuples containing (url, label)
"""
df = pd.read_csv(file_path)
url_data = []
for _, row in df.iterrows():
url = row['URL']
        label = row['label']  # NOTE: confirm the CSV's label convention matches the 0 = benign / 1 = malicious convention used elsewhere
if isinstance(url, str) and url.strip() and not pd.isna(label):
url_data.append((url, label))
print(f"Loaded {len(url_data)} URLs from PhiUSIIL dataset")
return url_data
def load_kaggle_data(file_path):
"""
Load the Kaggle malicious_phish.csv dataset.
Args:
file_path: Path to the CSV file
Returns:
List of tuples containing (url, label)
"""
df = pd.read_csv(file_path)
url_data = []
    for _, row in df.iterrows():
        url = row['url']
        type_val = row['type']
        # Skip rows with missing/non-string URL or type to avoid crashing on NaN values
        if not isinstance(url, str) or not url.strip() or not isinstance(type_val, str):
            continue
        # Convert to binary classification (0 for benign, 1 for all others)
        label = 0 if type_val.lower() == 'benign' else 1
        url_data.append((url, label))
print(f"Loaded {len(url_data)} URLs from Kaggle dataset")
return url_data
def combine_and_deduplicate(datasets):
"""
Combine multiple datasets and remove duplicates by URL.
Args:
datasets: List of datasets, each containing (url, label) tuples
Returns:
Tuple of (urls, labels) with duplicates removed
"""
url_to_label = {}
# Process each dataset
for dataset in datasets:
for url, label in dataset:
# If we've seen this URL before with a different label,
# prefer the malicious label (1) for safety
if url in url_to_label:
url_to_label[url] = max(url_to_label[url], label)
else:
url_to_label[url] = label
# Convert to lists
urls = list(url_to_label.keys())
labels = list(url_to_label.values())
print(f"After deduplication: {len(urls)} unique URLs")
# Report class distribution
label_counts = Counter(labels)
print(f"Class distribution - Benign (0): {label_counts[0]}, Malicious (1): {label_counts[1]}")
return urls, labels
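# Tiny worked example (made-up URLs): when the same URL appears with conflicting
# labels, the malicious label wins, so "http://a.com" comes out labeled 1 here.
def _demo_deduplication():
    ds1 = [("http://a.com", 0), ("http://b.com", 0)]
    ds2 = [("http://a.com", 1)]
    return combine_and_deduplicate([ds1, ds2])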
def extract_all_features(urls):
"""
Extract features from a list of URLs.
Args:
urls: List of URL strings
Returns:
Numpy array of feature vectors
"""
feature_vectors = []
# Use tqdm for a progress bar
for url in tqdm.tqdm(urls, desc="Extracting features"):
try:
features = extract_features(url)
feature_vectors.append(features)
except Exception as e:
print(f"Error extracting features from {url}: {str(e)}")
# Insert a vector of zeros in case of error
feature_vectors.append([0] * len(get_feature_names()))
return np.array(feature_vectors, dtype=np.float32)
# --- MLP Model ---
class PhishingMLP(nn.Module):
def __init__(self, input_size=22, hidden_sizes=[22, 30, 10], output_size=1):
"""
Multilayer Perceptron for Phishing URL Detection.
Args:
input_size: Number of input features (default: 22)
hidden_sizes: List of neurons in each hidden layer
            output_size: Number of output units (1 sigmoid unit for binary classification)
"""
super(PhishingMLP, self).__init__()
self.layers = nn.ModuleList()
# Input layer to first hidden layer
self.layers.append(nn.Linear(input_size, hidden_sizes[0]))
self.layers.append(nn.ReLU())
# Hidden layers
for i in range(len(hidden_sizes) - 1):
self.layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
self.layers.append(nn.ReLU())
# Output layer
self.layers.append(nn.Linear(hidden_sizes[-1], output_size))
self.layers.append(nn.Sigmoid())
def forward(self, x):
"""Forward pass through the network."""
for layer in self.layers:
x = layer(x)
return x
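# Shape sanity check (illustrative only): with the default layer sizes the model maps
# a (batch, 22) feature tensor to a (batch, 1) sigmoid probability.
def _demo_mlp_forward():
    model = PhishingMLP(input_size=len(get_feature_names()))
    dummy = torch.randn(4, len(get_feature_names()))
    out = model(dummy)
    assert out.shape == (4, 1)
    return out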
# --- Training Functions ---
def train_mlp(model, train_loader, val_loader, epochs=25, learning_rate=0.001, device="cpu"):
"""
Train the MLP model.
Args:
model: The MLP model
train_loader: DataLoader for training data
val_loader: DataLoader for validation data
epochs: Number of training epochs
learning_rate: Learning rate for optimization
device: Device to train on (cpu or cuda)
Returns:
Tuple of (trained_model, train_losses, val_losses, val_accuracies)
"""
model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
train_losses = []
val_losses = []
val_accuracies = []
print(f"Training on {device}...")
for epoch in range(epochs):
# Training phase
model.train()
running_loss = 0.0
for inputs, labels in train_loader:
inputs, labels = inputs.to(device), labels.to(device)
# Zero the parameter gradients
optimizer.zero_grad()
# Forward + backward + optimize
outputs = model(inputs)
loss = criterion(outputs, labels.unsqueeze(1).float())
loss.backward()
optimizer.step()
running_loss += loss.item()
# Calculate average training loss
epoch_train_loss = running_loss / len(train_loader)
train_losses.append(epoch_train_loss)
# Validation phase
model.eval()
val_loss = 0.0
correct = 0
total = 0
with torch.no_grad():
for inputs, labels in val_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
# Calculate validation loss
loss = criterion(outputs, labels.unsqueeze(1).float())
val_loss += loss.item()
# Calculate accuracy
predicted = (outputs > 0.5).float()
total += labels.size(0)
correct += (predicted.squeeze() == labels.float()).sum().item()
# Calculate average validation loss and accuracy
epoch_val_loss = val_loss / len(val_loader)
val_losses.append(epoch_val_loss)
val_accuracy = 100 * correct / total
val_accuracies.append(val_accuracy)
# Print progress
print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")
return model, train_losses, val_losses, val_accuracies
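# Smoke-test sketch (random data, assumed hyperparameters): exercises the training
# loop for two epochs on synthetic features; not part of the real pipeline.
def _demo_training_loop():
    X = np.random.rand(64, len(get_feature_names())).astype(np.float32)
    y = np.random.randint(0, 2, size=64)
    loader = DataLoader(URLDataset(X, y), batch_size=16, shuffle=True)
    model = PhishingMLP(input_size=X.shape[1])
    return train_mlp(model, loader, loader, epochs=2, learning_rate=0.001, device="cpu")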
def evaluate_model(model, test_loader, device):
"""
Evaluate the trained model on test data.
Args:
model: Trained model
test_loader: DataLoader for test data
device: Device to evaluate on
Returns:
Tuple of (accuracy, precision, recall, f1_score)
"""
model.to(device)
model.eval()
correct = 0
total = 0
true_positives = 0
false_positives = 0
false_negatives = 0
healthcare_correct = 0
healthcare_total = 0
    feature_idx = get_feature_names().index('healthcare_relevance')
    # NOTE: in main() the features are standardized before loading, so this threshold is
    # applied to the scaled healthcare_relevance value rather than the raw 0-1 score.
    healthcare_threshold = 0.5  # Threshold for considering a URL healthcare-related
with torch.no_grad():
for inputs, labels in test_loader:
inputs, labels = inputs.to(device), labels.to(device)
# Forward pass
outputs = model(inputs)
            predicted = (outputs > 0.5).float().view(-1)  # view(-1) avoids a 0-d tensor when the last batch holds a single sample
# Update counts
total += labels.size(0)
correct += (predicted == labels.float()).sum().item()
# Metrics calculation
for i in range(labels.size(0)):
if labels[i] == 1 and predicted[i] == 1:
true_positives += 1
elif labels[i] == 0 and predicted[i] == 1:
false_positives += 1
elif labels[i] == 1 and predicted[i] == 0:
false_negatives += 1
# Check healthcare relevance
if inputs[i, feature_idx] >= healthcare_threshold:
healthcare_total += 1
if predicted[i] == labels[i]:
healthcare_correct += 1
# Calculate metrics
accuracy = 100 * correct / total
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
# Healthcare-specific accuracy
healthcare_accuracy = 100 * healthcare_correct / healthcare_total if healthcare_total > 0 else 0.0
print(f"Overall Test Accuracy: {accuracy:.2f}%")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
print(f"Healthcare URLs identified: {healthcare_total} ({healthcare_total/total*100:.2f}%)")
print(f"Healthcare URL Detection Accuracy: {healthcare_accuracy:.2f}%")
return accuracy, precision, recall, f1, healthcare_accuracy
def plot_training_results(train_losses, val_losses, val_accuracies):
"""
Plot training metrics.
Args:
train_losses: List of training losses
val_losses: List of validation losses
val_accuracies: List of validation accuracies
"""
plt.figure(figsize=(15, 5))
# Plot losses
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
# Plot accuracy
plt.subplot(1, 2, 2)
plt.plot(val_accuracies, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.title('Validation Accuracy')
plt.legend()
plt.tight_layout()
plt.savefig('training_results.png')
plt.show()
def analyze_healthcare_features(features, labels, pred_labels):
"""
Analyze how the model performs on healthcare-related URLs.
Args:
features: Feature vectors
labels: True labels
pred_labels: Predicted labels
"""
healthcare_idx = get_feature_names().index('healthcare_relevance')
healthcare_scores = features[:, healthcare_idx]
# Define thresholds
thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]
print("\n=== Healthcare URL Analysis ===")
print("Healthcare relevance score distribution:")
for threshold in thresholds:
count = np.sum(healthcare_scores >= threshold)
percent = (count / len(healthcare_scores)) * 100
print(f" Score >= {threshold}: {count} URLs ({percent:.2f}%)")
# Analyze performance at different healthcare relevance levels
for threshold in thresholds:
mask = healthcare_scores >= threshold
if np.sum(mask) == 0:
continue
h_labels = labels[mask]
h_preds = pred_labels[mask]
h_accuracy = np.mean(h_labels == h_preds) * 100
benign_count = np.sum(h_labels == 0)
malicious_count = np.sum(h_labels == 1)
print(f"\nFor healthcare relevance >= {threshold}:")
print(f" URLs: {np.sum(mask)} ({benign_count} benign, {malicious_count} malicious)")
print(f" Accuracy: {h_accuracy:.2f}%")
# Calculate healthcare-specific metrics
tp = np.sum((h_labels == 1) & (h_preds == 1))
fp = np.sum((h_labels == 0) & (h_preds == 1))
fn = np.sum((h_labels == 1) & (h_preds == 0))
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f" Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
# Calculate false positive rate for healthcare URLs
if benign_count > 0:
h_fpr = np.sum((h_labels == 0) & (h_preds == 1)) / benign_count
print(f" False Positive Rate: {h_fpr:.4f}")
# Calculate false negative rate for healthcare URLs
if malicious_count > 0:
h_fnr = np.sum((h_labels == 1) & (h_preds == 0)) / malicious_count
print(f" False Negative Rate: {h_fnr:.4f}")
# --- Main Function ---
def main():
"""Main function to run the entire pipeline."""
# Configuration
batch_size = 32
learning_rate = 0.001
epochs = 20
test_size = 0.2
val_size = 0.2
random_seed = 42
device = "cuda" if torch.cuda.is_available() else "cpu"
# Filenames
huggingface_file = "urls.json"
phiusiil_file = "PhiUSIIL_Phishing_URL_Dataset.csv"
kaggle_file = "malicious_phish.csv"
# Load datasets
print("Loading datasets...")
huggingface_data = load_huggingface_data(huggingface_file)
phiusiil_data = load_phiusiil_data(phiusiil_file)
kaggle_data = load_kaggle_data(kaggle_file)
# Combine and deduplicate datasets
print("Combining and deduplicating datasets...")
urls, labels = combine_and_deduplicate([huggingface_data, phiusiil_data, kaggle_data])
# Extract features
print("Extracting features...")
features = extract_all_features(urls)
# Split into train, validation, and test sets
print("Splitting data...")
X_train_val, X_test, y_train_val, y_test = train_test_split(
features, labels, test_size=test_size, random_state=random_seed, stratify=labels
)
X_train, X_val, y_train, y_val = train_test_split(
X_train_val, y_train_val, test_size=val_size/(1-test_size),
random_state=random_seed, stratify=y_train_val
)
    # Standardize features (keep an unscaled copy of the test set so the healthcare
    # relevance thresholds in analyze_healthcare_features apply to the raw 0-1 scores)
    print("Standardizing features...")
    scaler = StandardScaler()
    X_test_raw = X_test.copy()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
# Create PyTorch datasets and dataloaders
print("Creating DataLoaders...")
train_dataset = URLDataset(X_train, y_train)
val_dataset = URLDataset(X_val, y_val)
test_dataset = URLDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# Initialize and train model
print("Initializing model...")
input_size = features.shape[1] # Number of features
model = PhishingMLP(input_size=input_size)
print("Training model...")
trained_model, train_losses, val_losses, val_accuracies = train_mlp(
model, train_loader, val_loader, epochs=epochs,
learning_rate=learning_rate, device=device
)
# Save trained model
print("Saving model...")
model_path = "phishing_mlp_model.pth"
torch.save(trained_model.state_dict(), model_path)
print(f"Model saved to {model_path}")
# Evaluate on test set
print("\nEvaluating model on test set...")
acc, prec, rec, f1, healthcare_acc = evaluate_model(trained_model, test_loader, device)
# Plot results
plot_training_results(train_losses, val_losses, val_accuracies)
# Further healthcare analysis
y_pred = []
trained_model.eval()
with torch.no_grad():
for inputs, _ in test_loader:
inputs = inputs.to(device)
outputs = trained_model(inputs)
            predicted = (outputs > 0.5).float().view(-1).cpu().numpy()  # view(-1) keeps a 1-d array even for a single-sample batch
y_pred.extend(predicted.tolist())
    analyze_healthcare_features(X_test_raw, np.array(y_test), np.array(y_pred))
# Print feature importance summary
feature_names = get_feature_names()
healthcare_idx = feature_names.index('healthcare_relevance')
healthcare_scores = features[:, healthcare_idx]
high_healthcare = healthcare_scores >= 0.5
print("\n=== Healthcare URL Examples ===")
high_healthcare_indices = np.where(high_healthcare)[0][:5] # Get first 5 indices
for idx in high_healthcare_indices:
print(f"URL: {urls[idx]}")
print(f"Healthcare Score: {healthcare_scores[idx]:.2f}")
print(f"Label: {'Malicious' if labels[idx] == 1 else 'Benign'}")
print()
# Summary
print("\n=== Summary ===")
print(f"Total URLs processed: {len(urls)}")
print(f"Training set: {len(X_train)} URLs")
print(f"Validation set: {len(X_val)} URLs")
print(f"Test set: {len(X_test)} URLs")
print(f"Model input features: {input_size}")
print(f"Test Accuracy: {acc:.2f}%")
print(f"Healthcare URL Accuracy: {healthcare_acc:.2f}%")
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")
print("\nTraining complete!")
if __name__ == "__main__":
main()