import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import re
# Load the model
model = tf.keras.models.load_model('new_phishing_detection_model.keras')

# Compile the model with standard loss and metrics
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
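# NOTE: recompiling does not alter the loaded weights; it only attaches the
# metrics reported alongside the loss. The optimizer settings are assumed to
# match the ones used during training.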
# Preprocessing functions
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_url(url):
    """Lower-case a URL, strip scheme/punctuation, then tokenize and lemmatize."""
    url = url.lower()
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
def preprocess_html(html):
    """Strip tags and punctuation from raw HTML, then tokenize and lemmatize."""
    html = re.sub(r'<[^>]+>', ' ', html)
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
# Define maximum sequence lengths and vocabulary size
max_url_length = 180
max_html_length = 2000
max_words = 10000
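# NOTE: these constants are assumed to match the configuration used when the
# model was trained; changing them would make the padded inputs incompatible
# with the model's fixed input shapes.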
# Load datasets
url_df = pd.read_csv('url_data.csv')
html_df = pd.read_csv('html_data.csv')
# Clean the URL 'Data' column
url_df['Cleaned_Data'] = url_df['Data'].apply(preprocess_url)
# Clean the HTML 'Data' column
html_df['Cleaned_Data'] = html_df['Data'].apply(preprocess_html)
# URL Tokenization and Padding
url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
url_tokenizer.fit_on_texts(url_df['Cleaned_Data'])
url_sequences = url_tokenizer.texts_to_sequences(url_df['Cleaned_Data'])
url_padded = pad_sequences(url_sequences, maxlen=max_url_length, padding='post', truncating='post')
# HTML Tokenization and Padding
html_tokenizer = Tokenizer(num_words=max_words)
html_tokenizer.fit_on_texts(html_df['Cleaned_Data'])
html_sequences = html_tokenizer.texts_to_sequences(html_df['Cleaned_Data'])
html_padded = pad_sequences(html_sequences, maxlen=max_html_length, padding='post', truncating='post')
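# NOTE: both tokenizers are re-fit on the CSVs at startup, so these files are
# assumed to be the same data the model was trained on; otherwise the token
# indices would not line up with what the model expects. The URL tokenizer is
# character-level, while the HTML tokenizer is word-level.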
# Encode 'Category' Column
label_encoder = LabelEncoder()
url_df['Category_Encoded'] = label_encoder.fit_transform(url_df['Category'])
html_df['Category_Encoded'] = label_encoder.transform(html_df['Category'])
# Split datasets into training and testing sets
url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(url_padded, url_df['Category_Encoded'], test_size=0.2, random_state=42)
html_X_train, html_X_test, html_y_train, html_y_test = train_test_split(html_padded, html_df['Category_Encoded'], test_size=0.2, random_state=42)
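# NOTE: the train/test splits above are carried over from the training
# pipeline for reference; the Gradio demo below does not use them.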
def preprocess_input(input_text, tokenizer, max_length):
    """Convert cleaned text into a padded integer sequence for the model."""
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences
def get_prediction(input_text, input_type):
    """Run a single forward pass; the unused branch gets a zero-filled dummy input."""
    is_url = input_type == "URL"
    if is_url:
        cleaned_text = preprocess_url(input_text)
        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
    else:
        cleaned_text = preprocess_html(input_text)
        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
    prediction = model.predict(input_data)[0][0]
    return prediction
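# NOTE: the "ensemble" below averages repeated forward passes of the same model
# on the same input. With a deterministic model the passes are identical, so the
# averaging only has an effect if the model applies dropout (or other
# stochasticity) at inference time.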
def ensemble_prediction(input_text, input_type, n_ensemble=5):
    predictions = [get_prediction(input_text, input_type) for _ in range(n_ensemble)]
    avg_prediction = np.mean(predictions)
    return avg_prediction
def phishing_detection(input_text, input_type):
    prediction = ensemble_prediction(input_text, input_type)
    threshold = 0.5  # classification threshold
    if prediction > threshold:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    else:
        return f"Safe: This site is unlikely to be a phishing site. ({prediction:.2f})"
iface = gr.Interface(
    fn=phishing_detection,
    inputs=[
        gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
        gr.components.Radio(["URL", "HTML"], type="value", label="Input Type")
    ],
    outputs=gr.components.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check whether a URL or a page's HTML is likely a phishing attempt.",
    theme="default"
)

iface.launch()
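# To try the app locally: run this file with `python app.py` and open the local
# URL that Gradio prints (by default http://127.0.0.1:7860).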