import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import Dataset, DataLoader, random_split import pandas as pd import numpy as np import json import os import re import urllib.parse import matplotlib.pyplot as plt from collections import Counter from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler import tqdm # --- Healthcare URL Detection Components --- # Healthcare-related keywords for domain detection HEALTHCARE_KEYWORDS = [ 'health', 'medical', 'hospital', 'clinic', 'pharma', 'patient', 'care', 'med', 'doctor', 'physician', 'nurse', 'therapy', 'rehab', 'dental', 'cardio', 'neuro', 'oncology', 'pediatric', 'orthopedic', 'surgery', 'diagnostic', 'wellbeing', 'wellness', 'ehr', 'emr', 'mychart', 'medicare', 'medicaid', 'insurance' ] # Common healthcare institutions and systems HEALTHCARE_INSTITUTIONS = [ 'mayo', 'cleveland', 'hopkins', 'kaiser', 'mount sinai', 'cedars', 'baylor', 'nhs', 'quest', 'labcorp', 'cvs', 'walgreens', 'aetna', 'cigna', 'unitedhealthcare', 'bluecross', 'anthem', 'humana', 'va.gov', 'cdc', 'who', 'nih' ] # Healthcare TLDs and specific domains HEALTHCARE_DOMAINS = ['.health', '.healthcare', '.medicine', '.hospital', '.clinic', 'mychart.'] # --- Feature Extraction Functions --- def url_length(url): """Return the length of the URL.""" return len(url) def num_dots(url): """Return the number of dots in the URL.""" return url.count('.') def num_hyphens(url): """Return the number of hyphens in the URL.""" return url.count('-') def num_at(url): """Return the number of @ symbols in the URL.""" return url.count('@') def num_tilde(url): """Return the number of ~ symbols in the URL.""" return url.count('~') def num_underscore(url): """Return the number of underscores in the URL.""" return url.count('_') def num_percent(url): """Return the number of percent symbols in the URL.""" return url.count('%') def num_ampersand(url): """Return the number of ampersands in the URL.""" return url.count('&') def num_hash(url): """Return the number of hash symbols in the URL.""" return url.count('#') def has_https(url): """Return 1 if the URL uses HTTPS, 0 otherwise.""" return int(url.startswith('https://')) def has_ip_address(url): """Check if the URL contains an IP address instead of a domain name.""" try: parsed_url = urllib.parse.urlparse(url) if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', parsed_url.netloc): return 1 # Check for IPv6 if re.match(r'^\[[0-9a-fA-F:]+\]$', parsed_url.netloc): return 1 return 0 except: return 0 def get_hostname_length(url): """Return the length of the hostname.""" try: parsed_url = urllib.parse.urlparse(url) return len(parsed_url.netloc) except: return 0 def get_path_length(url): """Return the length of the path.""" try: parsed_url = urllib.parse.urlparse(url) return len(parsed_url.path) except: return 0 def get_path_level(url): """Return the number of directories in the path.""" try: parsed_url = urllib.parse.urlparse(url) return parsed_url.path.count('/') except: return 0 def get_subdomain_level(url): """Return the number of subdomains in the URL.""" try: parsed_url = urllib.parse.urlparse(url) hostname = parsed_url.netloc if has_ip_address(url): return 0 # IP addresses don't have subdomains parts = hostname.split('.') # Remove top-level and second-level domains if len(parts) > 2: return len(parts) - 2 # Count remaining parts as subdomain levels else: return 0 # No subdomains except: return 0 def has_double_slash_in_path(url): """Check if the path contains a double slash.""" try: parsed_url = urllib.parse.urlparse(url) return int('//' in parsed_url.path) except: return 0 def get_tld(url): """Extract the top-level domain from a URL.""" try: parsed_url = urllib.parse.urlparse(url) hostname = parsed_url.netloc.lower() parts = hostname.split('.') if len(parts) > 1: return parts[-1] return '' except: return '' def count_digits(url): """Count the number of digits in the URL.""" return sum(c.isdigit() for c in url) def digit_ratio(url): """Calculate the ratio of digits to the total URL length.""" if len(url) == 0: return 0 return count_digits(url) / len(url) def count_letters(url): """Count the number of letters in the URL.""" return sum(c.isalpha() for c in url) def letter_ratio(url): """Calculate the ratio of letters to the total URL length.""" if len(url) == 0: return 0 return count_letters(url) / len(url) def count_special_chars(url): """Count the number of special characters in the URL.""" return sum(not c.isalnum() and not c.isspace() for c in url) def special_char_ratio(url): """Calculate the ratio of special characters to the total URL length.""" if len(url) == 0: return 0 return count_special_chars(url) / len(url) def get_query_length(url): """Return the length of the query string.""" try: parsed_url = urllib.parse.urlparse(url) return len(parsed_url.query) except: return 0 def get_fragment_length(url): """Return the length of the fragment.""" try: parsed_url = urllib.parse.urlparse(url) return len(parsed_url.fragment) except: return 0 def healthcare_relevance_score(url): """ Calculate a relevance score for healthcare-related URLs. Higher scores indicate stronger relation to healthcare. """ url_lower = url.lower() parsed_url = urllib.parse.urlparse(url_lower) domain = parsed_url.netloc path = parsed_url.path score = 0 # Check for healthcare keywords in domain for keyword in HEALTHCARE_KEYWORDS: if keyword in domain: score += 3 elif keyword in path: score += 1 # Check for healthcare institutions for institution in HEALTHCARE_INSTITUTIONS: if institution in domain: score += 4 elif institution in path: score += 2 # Check for healthcare-specific domains and TLDs for healthcare_domain in HEALTHCARE_DOMAINS: if healthcare_domain in domain: score += 3 # Check for EHR/patient portal indicators if 'portal' in domain or 'portal' in path: score += 2 if 'patient' in domain or 'mychart' in domain: score += 3 if 'ehr' in domain or 'emr' in domain: score += 3 # Normalize score to be between 0 and 1 return min(score / 10.0, 1.0) def extract_features(url): """Extract all features from a given URL.""" features = [ # Core features (the original 17) num_dots(url), get_subdomain_level(url), get_path_level(url), url_length(url), num_hyphens(url), num_at(url), num_tilde(url), num_underscore(url), num_percent(url), num_ampersand(url), num_hash(url), has_https(url), has_ip_address(url), get_hostname_length(url), get_path_length(url), has_double_slash_in_path(url), # Additional features digit_ratio(url), letter_ratio(url), special_char_ratio(url), get_query_length(url), get_fragment_length(url), healthcare_relevance_score(url) ] return features def get_feature_names(): """Get names of all features in the order they are extracted.""" return [ 'num_dots', 'subdomain_level', 'path_level', 'url_length', 'num_hyphens', 'num_at', 'num_tilde', 'num_underscore', 'num_percent', 'num_ampersand', 'num_hash', 'has_https', 'has_ip_address', 'hostname_length', 'path_length', 'double_slash_in_path', 'digit_ratio', 'letter_ratio', 'special_char_ratio', 'query_length', 'fragment_length', 'healthcare_relevance' ] # --- Dataset Loading and Processing --- class URLDataset(Dataset): def __init__(self, features, labels): """ Custom PyTorch Dataset for URL features and labels. Args: features (numpy.ndarray): Feature vectors for each URL labels (numpy.ndarray): Labels for each URL (0 for benign, 1 for malicious) """ self.features = torch.tensor(features, dtype=torch.float32) self.labels = torch.tensor(labels, dtype=torch.long) def __len__(self): return len(self.labels) def __getitem__(self, idx): return self.features[idx], self.labels[idx] def load_huggingface_data(file_path): """ Load the Hugging Face dataset from a JSON file. Args: file_path: Path to the JSON file Returns: List of tuples containing (url, label) """ with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) url_data = [] for item in data: url = item.get('text', '') label = item.get('label', -1) if url and label != -1: # Only add entries with valid URLs and labels url_data.append((url, label)) print(f"Loaded {len(url_data)} URLs from Hugging Face dataset") return url_data def load_phiusiil_data(file_path): """ Load the PhiUSIIL dataset from a CSV file. Args: file_path: Path to the CSV file Returns: List of tuples containing (url, label) """ df = pd.read_csv(file_path) url_data = [] for _, row in df.iterrows(): url = row['URL'] label = row['label'] if isinstance(url, str) and url.strip() and not pd.isna(label): url_data.append((url, label)) print(f"Loaded {len(url_data)} URLs from PhiUSIIL dataset") return url_data def load_kaggle_data(file_path): """ Load the Kaggle malicious_phish.csv dataset. Args: file_path: Path to the CSV file Returns: List of tuples containing (url, label) """ df = pd.read_csv(file_path) url_data = [] for _, row in df.iterrows(): url = row['url'] type_val = row['type'] # Convert to binary classification (0 for benign, 1 for all others) label = 0 if type_val.lower() == 'benign' else 1 if isinstance(url, str) and url.strip(): url_data.append((url, label)) print(f"Loaded {len(url_data)} URLs from Kaggle dataset") return url_data def combine_and_deduplicate(datasets): """ Combine multiple datasets and remove duplicates by URL. Args: datasets: List of datasets, each containing (url, label) tuples Returns: Tuple of (urls, labels) with duplicates removed """ url_to_label = {} # Process each dataset for dataset in datasets: for url, label in dataset: # If we've seen this URL before with a different label, # prefer the malicious label (1) for safety if url in url_to_label: url_to_label[url] = max(url_to_label[url], label) else: url_to_label[url] = label # Convert to lists urls = list(url_to_label.keys()) labels = list(url_to_label.values()) print(f"After deduplication: {len(urls)} unique URLs") # Report class distribution label_counts = Counter(labels) print(f"Class distribution - Benign (0): {label_counts[0]}, Malicious (1): {label_counts[1]}") return urls, labels def extract_all_features(urls): """ Extract features from a list of URLs. Args: urls: List of URL strings Returns: Numpy array of feature vectors """ feature_vectors = [] # Use tqdm for a progress bar for url in tqdm.tqdm(urls, desc="Extracting features"): try: features = extract_features(url) feature_vectors.append(features) except Exception as e: print(f"Error extracting features from {url}: {str(e)}") # Insert a vector of zeros in case of error feature_vectors.append([0] * len(get_feature_names())) return np.array(feature_vectors, dtype=np.float32) # --- MLP Model --- class PhishingMLP(nn.Module): def __init__(self, input_size=22, hidden_sizes=[22, 30, 10], output_size=1): """ Multilayer Perceptron for Phishing URL Detection. Args: input_size: Number of input features (default: 22) hidden_sizes: List of neurons in each hidden layer output_size: Number of output classes (1 for binary) """ super(PhishingMLP, self).__init__() self.layers = nn.ModuleList() # Input layer to first hidden layer self.layers.append(nn.Linear(input_size, hidden_sizes[0])) self.layers.append(nn.ReLU()) # Hidden layers for i in range(len(hidden_sizes) - 1): self.layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1])) self.layers.append(nn.ReLU()) # Output layer self.layers.append(nn.Linear(hidden_sizes[-1], output_size)) self.layers.append(nn.Sigmoid()) def forward(self, x): """Forward pass through the network.""" for layer in self.layers: x = layer(x) return x # --- Training Functions --- def train_mlp(model, train_loader, val_loader, epochs=25, learning_rate=0.001, device="cpu"): """ Train the MLP model. Args: model: The MLP model train_loader: DataLoader for training data val_loader: DataLoader for validation data epochs: Number of training epochs learning_rate: Learning rate for optimization device: Device to train on (cpu or cuda) Returns: Tuple of (trained_model, train_losses, val_losses, val_accuracies) """ model.to(device) criterion = nn.BCELoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) train_losses = [] val_losses = [] val_accuracies = [] print(f"Training on {device}...") for epoch in range(epochs): # Training phase model.train() running_loss = 0.0 for inputs, labels in train_loader: inputs, labels = inputs.to(device), labels.to(device) # Zero the parameter gradients optimizer.zero_grad() # Forward + backward + optimize outputs = model(inputs) loss = criterion(outputs, labels.unsqueeze(1).float()) loss.backward() optimizer.step() running_loss += loss.item() # Calculate average training loss epoch_train_loss = running_loss / len(train_loader) train_losses.append(epoch_train_loss) # Validation phase model.eval() val_loss = 0.0 correct = 0 total = 0 with torch.no_grad(): for inputs, labels in val_loader: inputs, labels = inputs.to(device), labels.to(device) outputs = model(inputs) # Calculate validation loss loss = criterion(outputs, labels.unsqueeze(1).float()) val_loss += loss.item() # Calculate accuracy predicted = (outputs > 0.5).float() total += labels.size(0) correct += (predicted.squeeze() == labels.float()).sum().item() # Calculate average validation loss and accuracy epoch_val_loss = val_loss / len(val_loader) val_losses.append(epoch_val_loss) val_accuracy = 100 * correct / total val_accuracies.append(val_accuracy) # Print progress print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%") return model, train_losses, val_losses, val_accuracies def evaluate_model(model, test_loader, device): """ Evaluate the trained model on test data. Args: model: Trained model test_loader: DataLoader for test data device: Device to evaluate on Returns: Tuple of (accuracy, precision, recall, f1_score) """ model.to(device) model.eval() correct = 0 total = 0 true_positives = 0 false_positives = 0 false_negatives = 0 healthcare_correct = 0 healthcare_total = 0 feature_idx = get_feature_names().index('healthcare_relevance') healthcare_threshold = 0.5 # Threshold for considering a URL healthcare-related with torch.no_grad(): for inputs, labels in test_loader: inputs, labels = inputs.to(device), labels.to(device) # Forward pass outputs = model(inputs) predicted = (outputs > 0.5).float().squeeze() # Update counts total += labels.size(0) correct += (predicted == labels.float()).sum().item() # Metrics calculation for i in range(labels.size(0)): if labels[i] == 1 and predicted[i] == 1: true_positives += 1 elif labels[i] == 0 and predicted[i] == 1: false_positives += 1 elif labels[i] == 1 and predicted[i] == 0: false_negatives += 1 # Check healthcare relevance if inputs[i, feature_idx] >= healthcare_threshold: healthcare_total += 1 if predicted[i] == labels[i]: healthcare_correct += 1 # Calculate metrics accuracy = 100 * correct / total precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0 recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0 f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0 # Healthcare-specific accuracy healthcare_accuracy = 100 * healthcare_correct / healthcare_total if healthcare_total > 0 else 0.0 print(f"Overall Test Accuracy: {accuracy:.2f}%") print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}") print(f"Healthcare URLs identified: {healthcare_total} ({healthcare_total/total*100:.2f}%)") print(f"Healthcare URL Detection Accuracy: {healthcare_accuracy:.2f}%") return accuracy, precision, recall, f1, healthcare_accuracy def plot_training_results(train_losses, val_losses, val_accuracies): """ Plot training metrics. Args: train_losses: List of training losses val_losses: List of validation losses val_accuracies: List of validation accuracies """ plt.figure(figsize=(15, 5)) # Plot losses plt.subplot(1, 2, 1) plt.plot(train_losses, label='Training Loss') plt.plot(val_losses, label='Validation Loss') plt.xlabel('Epoch') plt.ylabel('Loss') plt.title('Training and Validation Loss') plt.legend() # Plot accuracy plt.subplot(1, 2, 2) plt.plot(val_accuracies, label='Validation Accuracy') plt.xlabel('Epoch') plt.ylabel('Accuracy (%)') plt.title('Validation Accuracy') plt.legend() plt.tight_layout() plt.savefig('training_results.png') plt.show() def analyze_healthcare_features(features, labels, pred_labels): """ Analyze how the model performs on healthcare-related URLs. Args: features: Feature vectors labels: True labels pred_labels: Predicted labels """ healthcare_idx = get_feature_names().index('healthcare_relevance') healthcare_scores = features[:, healthcare_idx] # Define thresholds thresholds = [0.1, 0.3, 0.5, 0.7, 0.9] print("\n=== Healthcare URL Analysis ===") print("Healthcare relevance score distribution:") for threshold in thresholds: count = np.sum(healthcare_scores >= threshold) percent = (count / len(healthcare_scores)) * 100 print(f" Score >= {threshold}: {count} URLs ({percent:.2f}%)") # Analyze performance at different healthcare relevance levels for threshold in thresholds: mask = healthcare_scores >= threshold if np.sum(mask) == 0: continue h_labels = labels[mask] h_preds = pred_labels[mask] h_accuracy = np.mean(h_labels == h_preds) * 100 benign_count = np.sum(h_labels == 0) malicious_count = np.sum(h_labels == 1) print(f"\nFor healthcare relevance >= {threshold}:") print(f" URLs: {np.sum(mask)} ({benign_count} benign, {malicious_count} malicious)") print(f" Accuracy: {h_accuracy:.2f}%") # Calculate healthcare-specific metrics tp = np.sum((h_labels == 1) & (h_preds == 1)) fp = np.sum((h_labels == 0) & (h_preds == 1)) fn = np.sum((h_labels == 1) & (h_preds == 0)) precision = tp / (tp + fp) if (tp + fp) > 0 else 0 recall = tp / (tp + fn) if (tp + fn) > 0 else 0 f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 print(f" Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}") # Calculate false positive rate for healthcare URLs if benign_count > 0: h_fpr = np.sum((h_labels == 0) & (h_preds == 1)) / benign_count print(f" False Positive Rate: {h_fpr:.4f}") # Calculate false negative rate for healthcare URLs if malicious_count > 0: h_fnr = np.sum((h_labels == 1) & (h_preds == 0)) / malicious_count print(f" False Negative Rate: {h_fnr:.4f}") # --- Main Function --- def main(): """Main function to run the entire pipeline.""" # Configuration batch_size = 32 learning_rate = 0.001 epochs = 20 test_size = 0.2 val_size = 0.2 random_seed = 42 device = "cuda" if torch.cuda.is_available() else "cpu" # Filenames huggingface_file = "urls.json" phiusiil_file = "PhiUSIIL_Phishing_URL_Dataset.csv" kaggle_file = "malicious_phish.csv" # Load datasets print("Loading datasets...") huggingface_data = load_huggingface_data(huggingface_file) phiusiil_data = load_phiusiil_data(phiusiil_file) kaggle_data = load_kaggle_data(kaggle_file) # Combine and deduplicate datasets print("Combining and deduplicating datasets...") urls, labels = combine_and_deduplicate([huggingface_data, phiusiil_data, kaggle_data]) # Extract features print("Extracting features...") features = extract_all_features(urls) # Split into train, validation, and test sets print("Splitting data...") X_train_val, X_test, y_train_val, y_test = train_test_split( features, labels, test_size=test_size, random_state=random_seed, stratify=labels ) X_train, X_val, y_train, y_val = train_test_split( X_train_val, y_train_val, test_size=val_size/(1-test_size), random_state=random_seed, stratify=y_train_val ) # Standardize features print("Standardizing features...") scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_val = scaler.transform(X_val) X_test = scaler.transform(X_test) # Create PyTorch datasets and dataloaders print("Creating DataLoaders...") train_dataset = URLDataset(X_train, y_train) val_dataset = URLDataset(X_val, y_val) test_dataset = URLDataset(X_test, y_test) train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False) test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) # Initialize and train model print("Initializing model...") input_size = features.shape[1] # Number of features model = PhishingMLP(input_size=input_size) print("Training model...") trained_model, train_losses, val_losses, val_accuracies = train_mlp( model, train_loader, val_loader, epochs=epochs, learning_rate=learning_rate, device=device ) # Save trained model print("Saving model...") model_path = "phishing_mlp_model.pth" torch.save(trained_model.state_dict(), model_path) print(f"Model saved to {model_path}") # Evaluate on test set print("\nEvaluating model on test set...") acc, prec, rec, f1, healthcare_acc = evaluate_model(trained_model, test_loader, device) # Plot results plot_training_results(train_losses, val_losses, val_accuracies) # Further healthcare analysis y_pred = [] trained_model.eval() with torch.no_grad(): for inputs, _ in test_loader: inputs = inputs.to(device) outputs = trained_model(inputs) predicted = (outputs > 0.5).float().squeeze().cpu().numpy() y_pred.extend(predicted.tolist()) analyze_healthcare_features(X_test, np.array(y_test), np.array(y_pred)) # Print feature importance summary feature_names = get_feature_names() healthcare_idx = feature_names.index('healthcare_relevance') healthcare_scores = features[:, healthcare_idx] high_healthcare = healthcare_scores >= 0.5 print("\n=== Healthcare URL Examples ===") high_healthcare_indices = np.where(high_healthcare)[0][:5] # Get first 5 indices for idx in high_healthcare_indices: print(f"URL: {urls[idx]}") print(f"Healthcare Score: {healthcare_scores[idx]:.2f}") print(f"Label: {'Malicious' if labels[idx] == 1 else 'Benign'}") print() # Summary print("\n=== Summary ===") print(f"Total URLs processed: {len(urls)}") print(f"Training set: {len(X_train)} URLs") print(f"Validation set: {len(X_val)} URLs") print(f"Test set: {len(X_test)} URLs") print(f"Model input features: {input_size}") print(f"Test Accuracy: {acc:.2f}%") print(f"Healthcare URL Accuracy: {healthcare_acc:.2f}%") print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}") print("\nTraining complete!") if __name__ == "__main__": main()