File size: 27,075 Bytes

09c1340

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
import json
import os
import re
import urllib.parse
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tqdm

# --- Healthcare URL Detection Components ---

# Generic healthcare vocabulary. healthcare_relevance_score() tests each entry
# with a plain substring check against the lowercased host and path, so short
# entries such as 'med' or 'care' also match inside longer, unrelated words.
HEALTHCARE_KEYWORDS = [
    'health', 'medical', 'hospital', 'clinic', 'pharma', 'patient', 'care', 'med',
    'doctor', 'physician', 'nurse', 'therapy', 'rehab', 'dental', 'cardio', 'neuro',
    'oncology', 'pediatric', 'orthopedic', 'surgery', 'diagnostic', 'wellbeing',
    'wellness', 'ehr', 'emr', 'mychart', 'medicare', 'medicaid', 'insurance'
]

# Names of healthcare providers, insurers, pharmacies and public-health bodies,
# matched by the same substring check as the keywords above.
# NOTE(review): 'mount sinai' contains a space and is therefore unlikely to
# match a real host name -- confirm whether 'mountsinai' was intended.
HEALTHCARE_INSTITUTIONS = [
    'mayo', 'cleveland', 'hopkins', 'kaiser', 'mount sinai', 'cedars', 'baylor',
    'nhs', 'quest', 'labcorp', 'cvs', 'walgreens', 'aetna', 'cigna', 'unitedhealthcare',
    'bluecross', 'anthem', 'humana', 'va.gov', 'cdc', 'who', 'nih'
]

# Healthcare-specific TLD suffixes plus the 'mychart.' host prefix; only the
# host portion of a URL is checked against these.
HEALTHCARE_DOMAINS = ['.health', '.healthcare', '.medicine', '.hospital', '.clinic', 'mychart.']

# --- Feature Extraction Functions ---

def url_length(url):
    """Total number of characters in ``url``."""
    n_chars = len(url)
    return n_chars

def num_dots(url):
    """Count how many '.' characters occur anywhere in ``url``."""
    dot_total = url.count('.')
    return dot_total

def num_hyphens(url):
    """Count how many '-' characters occur anywhere in ``url``."""
    hyphen_total = url.count('-')
    return hyphen_total

def num_at(url):
    """Count how many '@' characters occur anywhere in ``url``."""
    at_total = url.count('@')
    return at_total

def num_tilde(url):
    """Count how many '~' characters occur anywhere in ``url``."""
    tilde_total = url.count('~')
    return tilde_total

def num_underscore(url):
    """Count how many '_' characters occur anywhere in ``url``."""
    underscore_total = url.count('_')
    return underscore_total

def num_percent(url):
    """Count how many '%' characters occur anywhere in ``url``."""
    percent_total = url.count('%')
    return percent_total

def num_ampersand(url):
    """Count how many '&' characters occur anywhere in ``url``."""
    ampersand_total = url.count('&')
    return ampersand_total

def num_hash(url):
    """Count how many '#' characters occur anywhere in ``url``."""
    hash_total = url.count('#')
    return hash_total

def has_https(url):
    """
    Return 1 if the URL uses the HTTPS scheme, 0 otherwise.

    URL schemes are case-insensitive, so ``HTTPS://`` and ``Https://`` count
    as HTTPS just like ``https://``.
    """
    prefix = 'https://'
    # Only the leading characters matter; lowercasing the slice avoids
    # copying the whole (possibly long) URL.
    return int(url[:len(prefix)].lower() == prefix)

def has_ip_address(url):
    """
    Check if the URL's host is an IP address instead of a domain name.

    The host is taken from the parsed URL with any ``user:password@`` prefix
    and ``:port`` suffix removed, so ``http://10.0.0.1:8080/`` and
    ``http://user@10.0.0.1/`` are both recognised.

    Args:
        url: URL string to inspect

    Returns:
        1 if the host looks like a dotted-quad IPv4 address or a bracketed
        IPv6 literal, 0 otherwise (including when the URL cannot be parsed
        or has no host part).
    """
    try:
        host = urllib.parse.urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # Malformed URL (e.g. unbalanced IPv6 brackets) or non-string input.
        return 0
    if not host:
        return 0

    # Dotted quad; octet ranges are deliberately not validated so that
    # look-alike hosts such as 999.1.1.1 are still flagged.
    if re.fullmatch(r'\d{1,3}(?:\.\d{1,3}){3}', host):
        return 1

    # ``hostname`` strips the surrounding brackets of an IPv6 literal and
    # splits off the port, so a remaining ':' can only come from IPv6.
    if ':' in host and re.fullmatch(r'[0-9a-f:.]+', host):
        return 1
    return 0

def get_hostname_length(url):
    """
    Return the length of the URL's network-location part.

    The measured string is ``netloc`` as produced by ``urllib.parse.urlparse``,
    so any ``user:password@`` prefix and ``:port`` suffix are included.

    Returns:
        Length of the netloc, or 0 if the URL cannot be parsed.
    """
    try:
        netloc = urllib.parse.urlparse(url).netloc
    except (ValueError, TypeError, AttributeError):
        # Only parse failures are treated as "no host"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return 0
    return len(netloc)

def get_path_length(url):
    """
    Return the length of the URL's path component.

    Returns:
        Number of characters in the parsed path, or 0 if the URL cannot be
        parsed.
    """
    try:
        path = urllib.parse.urlparse(url).path
    except (ValueError, TypeError, AttributeError):
        # Only parse failures map to 0; other exceptions must propagate.
        return 0
    return len(path)

def get_path_level(url):
    """
    Return the number of '/' separators in the URL's path component.

    Returns:
        Count of '/' characters in the parsed path, or 0 if the URL cannot
        be parsed.
    """
    try:
        path = urllib.parse.urlparse(url).path
    except (ValueError, TypeError, AttributeError):
        # Only parse failures map to 0; other exceptions must propagate.
        return 0
    return path.count('/')

def get_subdomain_level(url):
    """
    Return the number of subdomain labels in the URL's host.

    The host is split on '.', and every label beyond the last two (treated
    as the registered domain and its TLD) counts as one subdomain level.
    Userinfo and port are excluded, so dots in ``user.name@host`` are not
    miscounted as subdomains.

    Returns:
        Number of subdomain levels; 0 for IP-address hosts, for hosts with
        two or fewer labels, and for URLs that cannot be parsed.
    """
    try:
        host = urllib.parse.urlparse(url).hostname or ''
    except (ValueError, TypeError, AttributeError):
        return 0

    if has_ip_address(url):
        return 0  # IP addresses don't have subdomains

    # Everything left of the final two labels is a subdomain level.
    return max(len(host.split('.')) - 2, 0)

def has_double_slash_in_path(url):
    """
    Check if the URL's path component contains a double slash.

    The ``//`` that follows the scheme is part of the URL syntax, not of
    the path, and is therefore not counted.

    Returns:
        1 if '//' occurs in the parsed path, otherwise 0 (also 0 if the URL
        cannot be parsed).
    """
    try:
        path = urllib.parse.urlparse(url).path
    except (ValueError, TypeError, AttributeError):
        # Only parse failures map to 0; other exceptions must propagate.
        return 0
    return int('//' in path)

def get_tld(url):
    """
    Extract the top-level domain from a URL.

    The TLD is the last dot-separated label of the lowercased host. The
    host is taken without userinfo or port, so ``http://example.com:8080``
    yields ``'com'`` rather than ``'com:8080'``; a trailing root dot
    (``example.com.``) is ignored.

    Returns:
        The final host label, or '' when the host has no dot, the URL has
        no host part, or the URL cannot be parsed.
    """
    try:
        host = urllib.parse.urlparse(url).hostname or ''
    except (ValueError, TypeError, AttributeError):
        return ''

    labels = host.lower().rstrip('.').split('.')
    if len(labels) > 1:
        return labels[-1]
    return ''

def count_digits(url):
    """Tally the characters of ``url`` for which ``str.isdigit`` is true."""
    return sum(1 for ch in url if ch.isdigit())

def digit_ratio(url):
    """Fraction of the URL's characters that are digits (0 for an empty URL)."""
    total = len(url)
    if total == 0:
        return 0
    digits = sum(1 for ch in url if ch.isdigit())
    return digits / total

def count_letters(url):
    """Tally the characters of ``url`` for which ``str.isalpha`` is true."""
    return sum(1 for ch in url if ch.isalpha())

def letter_ratio(url):
    """Fraction of the URL's characters that are letters (0 for an empty URL)."""
    total = len(url)
    if total == 0:
        return 0
    letters = sum(1 for ch in url if ch.isalpha())
    return letters / total

def count_special_chars(url):
    """Tally the characters of ``url`` that are neither alphanumeric nor whitespace."""
    return sum(1 for ch in url if not (ch.isalnum() or ch.isspace()))

def special_char_ratio(url):
    """Fraction of the URL's characters that are special characters (0 for an empty URL)."""
    total = len(url)
    if total == 0:
        return 0
    specials = sum(1 for ch in url if not (ch.isalnum() or ch.isspace()))
    return specials / total

def get_query_length(url):
    """
    Return the length of the URL's query string.

    The query is the text between '?' and '#', excluding both delimiters.

    Returns:
        Number of characters in the parsed query, or 0 if the URL cannot be
        parsed.
    """
    try:
        query = urllib.parse.urlparse(url).query
    except (ValueError, TypeError, AttributeError):
        # Only parse failures map to 0; other exceptions must propagate.
        return 0
    return len(query)

def get_fragment_length(url):
    """
    Return the length of the URL's fragment.

    The fragment is the text after '#', excluding the '#' itself.

    Returns:
        Number of characters in the parsed fragment, or 0 if the URL cannot
        be parsed.
    """
    try:
        fragment = urllib.parse.urlparse(url).fragment
    except (ValueError, TypeError, AttributeError):
        # Only parse failures map to 0; other exceptions must propagate.
        return 0
    return len(fragment)

def healthcare_relevance_score(url):
    """
    Calculate a relevance score for healthcare-related URLs.

    The lowercased URL is parsed and its host and path are searched for
    substrings; points are accumulated as follows:

    * each HEALTHCARE_KEYWORDS entry: +3 if found in the host, else +1 if
      found in the path
    * each HEALTHCARE_INSTITUTIONS entry: +4 in the host, else +2 in the path
    * each HEALTHCARE_DOMAINS entry found in the host: +3
    * 'portal' in host or path: +2
    * 'patient' or 'mychart' in the host: +3
    * 'ehr' or 'emr' in the host: +3

    Args:
        url: URL string to score

    Returns:
        Float in [0, 1]: the accumulated points divided by 10 and capped at
        1.0. Higher scores indicate stronger relation to healthcare. URLs
        that cannot be parsed score 0.0, matching the other URL helpers,
        which also fall back to a neutral value on parse failure.
    """
    try:
        parsed_url = urllib.parse.urlparse(url.lower())
    except ValueError:
        # e.g. unbalanced IPv6 brackets; treat as "not healthcare" rather
        # than aborting feature extraction for the whole URL.
        return 0.0
    domain = parsed_url.netloc
    path = parsed_url.path

    score = 0

    # Generic healthcare vocabulary: a host match outweighs a path match.
    for keyword in HEALTHCARE_KEYWORDS:
        if keyword in domain:
            score += 3
        elif keyword in path:
            score += 1

    # Known institutions carry more weight than generic keywords.
    for institution in HEALTHCARE_INSTITUTIONS:
        if institution in domain:
            score += 4
        elif institution in path:
            score += 2

    # Healthcare-specific TLDs / host fragments are checked in the host only.
    for healthcare_domain in HEALTHCARE_DOMAINS:
        if healthcare_domain in domain:
            score += 3

    # EHR / patient-portal indicators. These are added on top of any
    # keyword points already awarded above for the same substrings.
    if 'portal' in domain or 'portal' in path:
        score += 2
    if 'patient' in domain or 'mychart' in domain:
        score += 3
    if 'ehr' in domain or 'emr' in domain:
        score += 3

    # Normalize score to be between 0 and 1
    return min(score / 10.0, 1.0)

def extract_features(url):
    """
    Build the feature vector for a single URL.

    Each extractor below is applied to ``url`` in turn; the position of a
    value in the returned list matches the order of the extractors.
    """
    extractors = (
        # Structural and character-count features
        num_dots,
        get_subdomain_level,
        get_path_level,
        url_length,
        num_hyphens,
        num_at,
        num_tilde,
        num_underscore,
        num_percent,
        num_ampersand,
        num_hash,
        has_https,
        has_ip_address,
        get_hostname_length,
        get_path_length,
        has_double_slash_in_path,
        # Ratio, query/fragment and healthcare features
        digit_ratio,
        letter_ratio,
        special_char_ratio,
        get_query_length,
        get_fragment_length,
        healthcare_relevance_score,
    )
    return [extract(url) for extract in extractors]

def get_feature_names():
    """Return a new list of feature names, one per extracted feature, in extraction order."""
    ordered_names = (
        'num_dots',
        'subdomain_level',
        'path_level',
        'url_length',
        'num_hyphens',
        'num_at',
        'num_tilde',
        'num_underscore',
        'num_percent',
        'num_ampersand',
        'num_hash',
        'has_https',
        'has_ip_address',
        'hostname_length',
        'path_length',
        'double_slash_in_path',
        'digit_ratio',
        'letter_ratio',
        'special_char_ratio',
        'query_length',
        'fragment_length',
        'healthcare_relevance',
    )
    return list(ordered_names)

# --- Dataset Loading and Processing ---

class URLDataset(Dataset):
    """In-memory PyTorch dataset pairing URL feature vectors with class labels."""

    def __init__(self, features, labels):
        """
        Convert both inputs to tensors once, up front.

        Args:
            features: Array-like of per-URL feature vectors; stored as float32
            labels: Array-like of per-URL labels (0 for benign, 1 for
                malicious); stored as int64
        """
        feature_tensor = torch.tensor(features, dtype=torch.float32)
        label_tensor = torch.tensor(labels, dtype=torch.long)
        self.features = feature_tensor
        self.labels = label_tensor

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

def load_huggingface_data(file_path):
    """
    Read (url, label) pairs from a JSON export of the Hugging Face dataset.

    The file must contain a JSON array of objects; the URL is read from
    each object's 'text' key and the label from its 'label' key.

    Args:
        file_path: Path to the JSON file

    Returns:
        List of (url, label) tuples. Entries whose URL is missing/empty or
        whose label is missing (or equal to -1) are left out.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Pair every record's URL with its label, then drop incomplete pairs.
    candidates = ((entry.get('text', ''), entry.get('label', -1)) for entry in records)
    url_data = [(text, tag) for text, tag in candidates if text and tag != -1]

    print(f"Loaded {len(url_data)} URLs from Hugging Face dataset")
    return url_data

def load_phiusiil_data(file_path, legitimate_label=1):
    """
    Load the PhiUSIIL dataset from a CSV file.

    PhiUSIIL encodes its classes the opposite way round from the rest of
    this pipeline: in the published dataset 1 means *legitimate* and 0 means
    *phishing*. Labels are therefore remapped here to the pipeline's
    convention (0 for benign, 1 for malicious) so that they agree with the
    other loaders before datasets are merged.

    Args:
        file_path: Path to the CSV file; must have 'URL' and 'label' columns
        legitimate_label: Value of the 'label' column that marks a
            legitimate URL (default: 1, the published PhiUSIIL encoding).
            Every other non-missing value is treated as malicious.

    Returns:
        List of tuples containing (url, label), with label 0 for benign and
        1 for malicious. Rows with a missing/blank URL or a missing label
        are skipped.
    """
    df = pd.read_csv(file_path)

    url_data = []
    # Iterate the two needed columns directly; far cheaper than iterrows(),
    # which builds a Series per row.
    for url, raw_label in zip(df['URL'], df['label']):
        if not isinstance(url, str) or not url.strip() or pd.isna(raw_label):
            continue
        label = 0 if raw_label == legitimate_label else 1
        url_data.append((url, label))

    print(f"Loaded {len(url_data)} URLs from PhiUSIIL dataset")
    return url_data

def load_kaggle_data(file_path):
    """
    Load the Kaggle malicious_phish.csv dataset.

    The multi-class 'type' column is collapsed to a binary label: 'benign'
    (compared case-insensitively, ignoring surrounding whitespace) becomes
    0 and every other type becomes 1.

    Args:
        file_path: Path to the CSV file; must have 'url' and 'type' columns

    Returns:
        List of tuples containing (url, label). Rows with a missing/blank
        URL or a missing/blank type are skipped instead of aborting the load.
    """
    df = pd.read_csv(file_path)

    url_data = []
    # Iterate the two needed columns directly; far cheaper than iterrows().
    for url, type_val in zip(df['url'], df['type']):
        if not isinstance(url, str) or not url.strip():
            continue
        # A missing type is read as NaN (a float); without a type there is
        # no label to assign, so the row is dropped.
        if not isinstance(type_val, str) or not type_val.strip():
            continue

        # Convert to binary classification (0 for benign, 1 for all others)
        label = 0 if type_val.strip().lower() == 'benign' else 1
        url_data.append((url, label))

    print(f"Loaded {len(url_data)} URLs from Kaggle dataset")
    return url_data

def combine_and_deduplicate(datasets):
    """
    Merge several (url, label) collections into one, keeping each URL once.

    When the same URL occurs more than once, the larger label wins, so a
    URL marked malicious (1) anywhere stays malicious. URLs keep the order
    in which they were first seen.

    Args:
        datasets: List of datasets, each containing (url, label) tuples

    Returns:
        Tuple of (urls, labels) with duplicates removed
    """
    merged = {}
    for entries in datasets:
        for address, tag in entries:
            # For a first sighting the fallback is the label itself, so
            # max() simply stores it; otherwise the larger label is kept.
            merged[address] = max(merged.get(address, tag), tag)

    urls = list(merged)
    labels = [merged[address] for address in urls]

    print(f"After deduplication: {len(urls)} unique URLs")

    # Report class distribution
    tally = Counter(labels)
    print(f"Class distribution - Benign (0): {tally[0]}, Malicious (1): {tally[1]}")

    return urls, labels

def extract_all_features(urls):
    """
    Turn a list of URLs into a float32 feature matrix, one row per URL.

    A progress bar is shown while iterating. If feature extraction raises
    for a URL, the error is printed and an all-zero row is used instead,
    so the output always has one row per input URL.

    Args:
        urls: List of URL strings

    Returns:
        Numpy array of feature vectors
    """
    rows = []

    progress = tqdm.tqdm(urls, desc="Extracting features")
    for address in progress:
        try:
            row = extract_features(address)
        except Exception as err:
            print(f"Error extracting features from {address}: {str(err)}")
            # Placeholder row, sized to the number of known features.
            row = [0] * len(get_feature_names())
        rows.append(row)

    return np.array(rows, dtype=np.float32)

# --- MLP Model ---
class PhishingMLP(nn.Module):
    """Fully connected network with ReLU hidden layers and a sigmoid output."""

    def __init__(self, input_size=22, hidden_sizes=(22, 30, 10), output_size=1):
        """
        Multilayer Perceptron for Phishing URL Detection.

        Args:
            input_size: Number of input features (default: 22)
            hidden_sizes: Sequence with the number of neurons in each hidden
                layer. May be empty, in which case the input is connected
                directly to the output layer.
            output_size: Number of output units (1 for binary)
        """
        super().__init__()

        # Widths of the input followed by every hidden layer; consecutive
        # pairs describe one Linear layer each.
        widths = [input_size, *hidden_sizes]

        # Module order (Linear, ReLU, ..., Linear, Sigmoid) determines the
        # state_dict keys ('layers.0.weight', 'layers.2.weight', ...), so it
        # must stay stable for saved checkpoints to load.
        self.layers = nn.ModuleList()
        for fan_in, fan_out in zip(widths, widths[1:]):
            self.layers.append(nn.Linear(fan_in, fan_out))
            self.layers.append(nn.ReLU())

        # Output layer
        self.layers.append(nn.Linear(widths[-1], output_size))
        self.layers.append(nn.Sigmoid())

    def forward(self, x):
        """Apply every layer in order and return the sigmoid activations."""
        for layer in self.layers:
            x = layer(x)
        return x

# --- Training Functions ---
def train_mlp(model, train_loader, val_loader, epochs=25, learning_rate=0.001, device="cpu"):
    """
    Train the MLP model with binary cross-entropy loss and Adam.

    After every epoch the model is evaluated on the validation data and a
    progress line is printed.

    Args:
        model: The MLP model; its output must be probabilities in [0, 1]
            (BCELoss is applied to the raw output)
        train_loader: Iterable of (inputs, labels) training batches
        val_loader: Iterable of (inputs, labels) validation batches
        epochs: Number of training epochs
        learning_rate: Learning rate for optimization
        device: Device to train on (cpu or cuda)

    Returns:
        Tuple of (trained_model, train_losses, val_losses, val_accuracies).
        Losses are averaged per batch and accuracies are percentages; each
        list has one entry per epoch. If a loader yields no batches, the
        corresponding entries are NaN instead of raising ZeroDivisionError.
    """
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_losses = []
    val_losses = []
    val_accuracies = []
    undefined = float('nan')  # reported when there is nothing to average

    print(f"Training on {device}...")
    for epoch in range(epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        train_batches = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize. Labels are reshaped to the
            # (batch, 1) float layout that BCELoss expects.
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1).float())
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            train_batches += 1

        # Calculate average training loss
        epoch_train_loss = running_loss / train_batches if train_batches else undefined
        train_losses.append(epoch_train_loss)

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_batches = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)

                # Calculate validation loss
                loss = criterion(outputs, labels.unsqueeze(1).float())
                val_loss += loss.item()
                val_batches += 1

                # Flatten explicitly instead of squeeze(), so predictions
                # stay 1-D even for a batch holding a single sample.
                predicted = (outputs > 0.5).reshape(-1).long()
                targets = labels.reshape(-1).long()
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

        # Calculate average validation loss and accuracy
        epoch_val_loss = val_loss / val_batches if val_batches else undefined
        val_losses.append(epoch_val_loss)

        val_accuracy = 100 * correct / total if total else undefined
        val_accuracies.append(val_accuracy)

        # Print progress
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

    return model, train_losses, val_losses, val_accuracies

def evaluate_model(model, test_loader, device, healthcare_threshold=0.5):
    """
    Evaluate the trained model on test data.

    Predictions are obtained by thresholding the model output at 0.5.
    Besides the overall metrics, accuracy is also reported for the subset
    of samples whose 'healthcare_relevance' input feature is at or above
    ``healthcare_threshold``.

    NOTE(review): the threshold is compared against the feature value as it
    appears in the loader. main() feeds StandardScaler-transformed features,
    so there the comparison is against z-scores, not raw [0, 1] scores.

    Args:
        model: Trained model
        test_loader: Iterable of (inputs, labels) test batches
        device: Device to evaluate on
        healthcare_threshold: Minimum 'healthcare_relevance' feature value
            for a sample to count as healthcare-related (default: 0.5)

    Returns:
        Tuple of (accuracy, precision, recall, f1_score, healthcare_accuracy).
        The two accuracies are percentages; precision, recall and F1 are
        fractions. Any metric with an empty denominator is reported as 0.0.
    """
    model.to(device)
    model.eval()

    correct = 0
    total = 0
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    healthcare_correct = 0
    healthcare_total = 0

    feature_idx = get_feature_names().index('healthcare_relevance')

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass. Flatten explicitly rather than squeeze(): for a
            # batch of one, squeeze() would give a 0-d tensor.
            outputs = model(inputs)
            predicted = (outputs > 0.5).reshape(-1).long()
            targets = labels.reshape(-1).long()
            hits = predicted == targets

            # Update counts
            total += targets.size(0)
            correct += hits.sum().item()

            # Confusion-matrix cells needed for precision / recall
            flagged = predicted == 1
            malicious = targets == 1
            true_positives += (flagged & malicious).sum().item()
            false_positives += (flagged & ~malicious).sum().item()
            false_negatives += (~flagged & malicious).sum().item()

            # Healthcare subset
            healthcare_mask = inputs[:, feature_idx] >= healthcare_threshold
            healthcare_total += healthcare_mask.sum().item()
            healthcare_correct += (hits & healthcare_mask).sum().item()

    # Calculate metrics
    accuracy = 100 * correct / total if total > 0 else 0.0
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    # Healthcare-specific accuracy and share of the test set
    healthcare_accuracy = 100 * healthcare_correct / healthcare_total if healthcare_total > 0 else 0.0
    healthcare_share = healthcare_total / total * 100 if total > 0 else 0.0

    print(f"Overall Test Accuracy: {accuracy:.2f}%")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print(f"Healthcare URLs identified: {healthcare_total} ({healthcare_share:.2f}%)")
    print(f"Healthcare URL Detection Accuracy: {healthcare_accuracy:.2f}%")

    return accuracy, precision, recall, f1, healthcare_accuracy

def plot_training_results(train_losses, val_losses, val_accuracies):
    """
    Draw the loss and accuracy curves side by side, save them, and show them.

    The figure is written to 'training_results.png' in the current working
    directory before being displayed.

    Args:
        train_losses: List of training losses
        val_losses: List of validation losses
        val_accuracies: List of validation accuracies
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: both loss curves
    loss_ax.plot(train_losses, label='Training Loss')
    loss_ax.plot(val_losses, label='Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training and Validation Loss')
    loss_ax.legend()

    # Right panel: validation accuracy
    acc_ax.plot(val_accuracies, label='Validation Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy (%)')
    acc_ax.set_title('Validation Accuracy')
    acc_ax.legend()

    fig.tight_layout()
    fig.savefig('training_results.png')
    plt.show()

def analyze_healthcare_features(features, labels, pred_labels):
    """
    Print a report of model performance on healthcare-related URLs.

    The 'healthcare_relevance' column of ``features`` is compared against
    a fixed set of thresholds. The report first shows how many URLs reach
    each threshold, then, for every threshold reached by at least one URL,
    the accuracy, precision, recall, F1 and false positive/negative rates
    on that subset.

    Args:
        features: Feature vectors
        labels: True labels
        pred_labels: Predicted labels
    """
    column = get_feature_names().index('healthcare_relevance')
    scores = features[:, column]
    cutoffs = [0.1, 0.3, 0.5, 0.7, 0.9]

    print("\n=== Healthcare URL Analysis ===")
    print("Healthcare relevance score distribution:")
    for threshold in cutoffs:
        count = np.sum(scores >= threshold)
        percent = (count / len(scores)) * 100
        print(f"  Score >= {threshold}: {count} URLs ({percent:.2f}%)")

    # Per-threshold performance on the selected subset
    for threshold in cutoffs:
        selected = scores >= threshold
        if np.sum(selected) == 0:
            continue

        truth = labels[selected]
        guess = pred_labels[selected]

        is_benign = truth == 0
        is_malicious = truth == 1
        benign_count = np.sum(is_benign)
        malicious_count = np.sum(is_malicious)
        h_accuracy = np.mean(truth == guess) * 100

        print(f"\nFor healthcare relevance >= {threshold}:")
        print(f"  URLs: {np.sum(selected)} ({benign_count} benign, {malicious_count} malicious)")
        print(f"  Accuracy: {h_accuracy:.2f}%")

        # Confusion-matrix cells for the subset
        tp = np.sum(is_malicious & (guess == 1))
        fp = np.sum(is_benign & (guess == 1))
        fn = np.sum(is_malicious & (guess == 0))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0

        print(f"  Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

        # Error rates are only defined when the class is present
        if benign_count > 0:
            h_fpr = fp / benign_count
            print(f"  False Positive Rate: {h_fpr:.4f}")

        if malicious_count > 0:
            h_fnr = fn / malicious_count
            print(f"  False Negative Rate: {h_fnr:.4f}")

# --- Main Function ---
def main():
    """
    Run the entire pipeline: load, featurize, train, evaluate and report.

    Reads the three dataset files from the current working directory,
    trains a PhishingMLP on a stratified train/validation/test split, saves
    the model weights to 'phishing_mlp_model.pth', plots the training
    curves and prints overall and healthcare-specific metrics.
    """
    # Configuration
    batch_size = 32
    learning_rate = 0.001
    epochs = 20
    test_size = 0.2
    val_size = 0.2
    random_seed = 42
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Seed torch as well, so weight initialisation and batch shuffling are
    # reproducible along with the data split.
    torch.manual_seed(random_seed)

    # Filenames
    huggingface_file = "urls.json"
    phiusiil_file = "PhiUSIIL_Phishing_URL_Dataset.csv"
    kaggle_file = "malicious_phish.csv"

    # Load datasets
    print("Loading datasets...")
    huggingface_data = load_huggingface_data(huggingface_file)
    phiusiil_data = load_phiusiil_data(phiusiil_file)
    kaggle_data = load_kaggle_data(kaggle_file)

    # Combine and deduplicate datasets
    print("Combining and deduplicating datasets...")
    urls, labels = combine_and_deduplicate([huggingface_data, phiusiil_data, kaggle_data])

    # Extract features
    print("Extracting features...")
    features = extract_all_features(urls)

    # Split into train, validation, and test sets
    print("Splitting data...")
    X_train_val, X_test_raw, y_train_val, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_seed, stratify=labels
    )

    # val_size is a fraction of the whole dataset, so rescale it to a
    # fraction of what remains after the test split.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_size/(1-test_size),
        random_state=random_seed, stratify=y_train_val
    )

    # Standardize features. The scaler is fitted on the training split only;
    # the unscaled test features are kept for the healthcare analysis below.
    print("Standardizing features...")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test_raw)

    # Create PyTorch datasets and dataloaders
    print("Creating DataLoaders...")
    train_dataset = URLDataset(X_train, y_train)
    val_dataset = URLDataset(X_val, y_val)
    test_dataset = URLDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize and train model
    print("Initializing model...")
    input_size = features.shape[1]  # Number of features
    model = PhishingMLP(input_size=input_size)

    print("Training model...")
    trained_model, train_losses, val_losses, val_accuracies = train_mlp(
        model, train_loader, val_loader, epochs=epochs,
        learning_rate=learning_rate, device=device
    )

    # Save trained model
    # NOTE(review): the fitted StandardScaler is not saved, so the weights
    # alone cannot reproduce predictions on new URLs.
    print("Saving model...")
    model_path = "phishing_mlp_model.pth"
    torch.save(trained_model.state_dict(), model_path)
    print(f"Model saved to {model_path}")

    # Evaluate on test set
    print("\nEvaluating model on test set...")
    acc, prec, rec, f1, healthcare_acc = evaluate_model(trained_model, test_loader, device)

    # Plot results
    plot_training_results(train_losses, val_losses, val_accuracies)

    # Further healthcare analysis
    y_pred = []
    trained_model.eval()
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(device)
            outputs = trained_model(inputs)
            # reshape(-1) keeps the batch 1-D even when it holds a single
            # sample; squeeze() would yield a scalar that extend() rejects.
            predicted = (outputs > 0.5).float().reshape(-1).cpu().numpy()
            y_pred.extend(predicted.tolist())

    # The analysis thresholds are defined on the raw [0, 1] relevance score,
    # so it must see the unscaled features rather than z-scores.
    analyze_healthcare_features(X_test_raw, np.array(y_test), np.array(y_pred))

    # Show a few URLs with a high healthcare relevance score
    feature_names = get_feature_names()
    healthcare_idx = feature_names.index('healthcare_relevance')
    healthcare_scores = features[:, healthcare_idx]
    high_healthcare = healthcare_scores >= 0.5

    print("\n=== Healthcare URL Examples ===")
    high_healthcare_indices = np.where(high_healthcare)[0][:5]  # Get first 5 indices
    for idx in high_healthcare_indices:
        print(f"URL: {urls[idx]}")
        print(f"Healthcare Score: {healthcare_scores[idx]:.2f}")
        print(f"Label: {'Malicious' if labels[idx] == 1 else 'Benign'}")
        print()

    # Summary
    print("\n=== Summary ===")
    print(f"Total URLs processed: {len(urls)}")
    print(f"Training set: {len(X_train)} URLs")
    print(f"Validation set: {len(X_val)} URLs")
    print(f"Test set: {len(X_test)} URLs")
    print(f"Model input features: {input_size}")
    print(f"Test Accuracy: {acc:.2f}%")
    print(f"Healthcare URL Accuracy: {healthcare_acc:.2f}%")
    print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")
    print("\nTraining complete!")

if __name__ == "__main__":
    main()