Update app.py
app.py CHANGED

@@ -25,32 +25,18 @@ nltk.download('wordnet')
 STOPWORDS = set(stopwords.words('english'))
 lemmatizer = WordNetLemmatizer()
 
-def normalize_length(url, target_length):
-    if len(url) < target_length:
-        url = url.ljust(target_length)
-    else:
-        url = url[:target_length]
-    return url
+def normalize_length(text, target_length):
+    text = text[:target_length].ljust(target_length)
+    return text
 
-def preprocess_url(url):
-
-
-
-
-
-
-    tokens = word_tokenize(url)
-    tokens = [word for word in tokens if word not in STOPWORDS]
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]
-    return ' '.join(tokens)
-
-def preprocess_html(html):
-    html = re.sub(r'<[^>]+>', ' ', html)
-    html = html.lower()
-    html = re.sub(r'https?://', '', html)
-    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
-    html = re.sub(r'\s+', ' ', html).strip()
-    tokens = word_tokenize(html)
+def preprocess_text(text, is_url=True):
+    text = text.lower()
+    if is_url:
+        text = re.sub(r'https?://', '', text)
+        text = re.sub(r'www\.', '', text)
+    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
+    text = re.sub(r'\s+', ' ', text).strip()
+    tokens = word_tokenize(text)
     tokens = [word for word in tokens if word not in STOPWORDS]
     tokens = [lemmatizer.lemmatize(word) for word in tokens]
     return ' '.join(tokens)
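
A quick sanity check of the merged preprocess_text helper (a minimal sketch, not part of the commit; the sample URL is made up, and the 'punkt' and 'stopwords' downloads are assumed here so the snippet runs standalone, the hunk header only shows app.py downloading 'wordnet'):

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')       # tokenizer models for word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')     # lemmatizer data

STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text, is_url=True):
    text = text.lower()
    if is_url:
        text = re.sub(r'https?://', '', text)   # drop the scheme
        text = re.sub(r'www\.', '', text)       # drop the www. prefix
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)   # punctuation -> spaces
    text = re.sub(r'\s+', ' ', text).strip()    # collapse whitespace
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

print(preprocess_text('https://www.example.com/verify-account', is_url=True))
# -> example com verify account
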
@@ -73,20 +59,25 @@ def preprocess_input(input_text, tokenizer, max_length):
 def get_prediction(input_text, input_type):
     is_url = input_type == "URL"
     if is_url:
-        cleaned_text = preprocess_url(input_text)
+        cleaned_text = preprocess_text(input_text, is_url=True)
         input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
         input_data = [input_data, np.zeros((1, max_html_length))] # dummy HTML input
     else:
-        cleaned_text = preprocess_html(input_text)
+        cleaned_text = preprocess_text(input_text, is_url=False)
         input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
         input_data = [np.zeros((1, max_url_length)), input_data] # dummy URL input
 
     prediction = model.predict(input_data)[0][0]
     return prediction
 
+def ensemble_prediction(input_text, input_type, n_ensemble=5):
+    predictions = [get_prediction(input_text, input_type) for _ in range(n_ensemble)]
+    avg_prediction = np.mean(predictions)
+    return avg_prediction
+
 def phishing_detection(input_text, input_type):
-    prediction = get_prediction(input_text, input_type)
-    threshold = 0.5 #
+    prediction = ensemble_prediction(input_text, input_type)
+    threshold = 0.5 # Keep the threshold unchanged
     if prediction > threshold:
         return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
     else:
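
A note on the "dummy" inputs in get_prediction: the model has two input branches, URL tokens and HTML tokens, so whichever branch has no real data is fed zeros of the matching shape. A shape-only sketch (max_url_length and max_html_length below are illustrative values, not the app's real settings):

import numpy as np

max_url_length, max_html_length = 100, 500   # illustrative values only

url_vec = np.ones((1, max_url_length))                   # stand-in for a padded URL sequence
input_data = [url_vec, np.zeros((1, max_html_length))]   # dummy HTML branch
print(input_data[0].shape, input_data[1].shape)          # (1, 100) (1, 500)
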
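One caveat on the new ensemble_prediction: model.predict is deterministic for a standard Keras model, so averaging n_ensemble identical forward passes returns the same number a single call would, just n_ensemble times slower. The average only varies if inference is stochastic, for example with dropout kept active at predict time (Monte Carlo dropout). A minimal sketch of that variant, assuming model is a tf.keras.Model that contains Dropout layers (an assumption; the commit does not show the model definition):

import numpy as np

def mc_ensemble_prediction(model, input_data, n_ensemble=5):
    # training=True keeps dropout active, so each forward pass
    # draws a different stochastic sample of the network
    preds = [float(model(input_data, training=True)[0][0]) for _ in range(n_ensemble)]
    return float(np.mean(preds))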