Spaces:
Runtime error
Runtime error
File size: 5,603 Bytes
01cbf36 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
from modules import *
from pathlib import Path
import pandas as pd
from flask import Flask, render_template, request
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from joblib import load
import sklearn
import ssl
# Swap ssl's default HTTPS context factory for the unverified one when this
# Python build exposes it; builds without the attribute are left untouched.
# NOTE(review): this turns off certificate verification for every default
# HTTPS context in the process. Presumably it was added for the
# nltk.download() calls below, which are now commented out -- confirm it is
# still needed.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('wordnet')
def check_file_type(file):
    """Validate an uploaded file's extension and store it when accepted.

    Args:
        file: Upload object exposing ``filename`` plus whatever
            ``save_file`` needs (presumably a Flask/Werkzeug upload object
            -- confirm against the caller).

    Returns:
        ``'Extracted Features'`` once the file has been handed to
        ``save_file``, or a prompt asking for an ``.eml``/``.txt`` file when
        the extension is anything else, including a missing filename.
    """
    # A missing filename (None or "") has no suffix, so it is rejected below
    # instead of raising TypeError inside Path().
    file_extension = Path(file.filename or "").suffix.lower()
    if file_extension not in ('.eml', '.txt'):
        return "Please select .eml or .txt file."

    save_file(file)
    # NOTE(review): feature extraction is not wired in here; the intended
    # call appears to be get_features('email files/' + file.filename).
    return 'Extracted Features'
def save_file(file):
    """Store an uploaded UTF-8 text file inside the ``email files/`` directory.

    Only the final path component of ``file.filename`` is used, so a
    client-supplied name such as ``../../app.py`` cannot escape the upload
    directory. The content is written with an explicit UTF-8 encoding and
    no newline translation, so the bytes on disk match the upload on every
    platform.

    Args:
        file: Upload object exposing ``filename`` and a ``read()`` method
            that returns bytes.

    Raises:
        UnicodeDecodeError: If the payload is not valid UTF-8.
        ValueError: If the filename has no usable final component.
    """
    # Decode before touching the filesystem so an undecodable upload does not
    # leave an empty file behind.
    text = file.read().decode('utf-8')

    # Treat backslashes as separators too, so "..\\name" is also reduced to
    # its last component on POSIX systems.
    safe_name = Path((file.filename or "").replace("\\", "/")).name
    if safe_name in ("", ".."):
        raise ValueError("Invalid upload filename: {!r}".format(file.filename))

    upload_dir = Path('email files')
    upload_dir.mkdir(parents=True, exist_ok=True)
    with open(upload_dir / safe_name, 'w', encoding='utf-8', newline='') as f:
        f.write(text)
def text_feature(filepath):
    """Build a one-row DataFrame holding the email's whitespace-normalised text.

    Args:
        filepath: Path of the email file, passed straight to ``get_text``.

    Returns:
        A DataFrame with a single ``'text'`` column whose only cell is the
        text with every run of whitespace collapsed to one space. Empty text
        yields an empty string.
    """
    text = get_text(filepath)
    # split() + join collapses whitespace. Falling back to "" keeps the result
    # a one-row frame for empty text, and also if get_text ever returns None
    # (its contract is defined elsewhere -- verify).
    textlist = ' '.join((text or "").split())
    return pd.DataFrame([[textlist]], columns=['text'])
def html_tags_feature(filepath):
    """Build a one-row DataFrame holding the email's HTML tags.

    The tags come from ``get_tags_from_html(get_html_general(filepath))`` and
    are joined with single spaces into the ``'tags'`` column.

    Args:
        filepath: Path of the email file.

    Returns:
        A DataFrame with a single ``'tags'`` column and one row.
    """
    html = get_html_general(filepath)
    tags = get_tags_from_html(html)
    if tags != []:
        tag_cell = ' '.join(tags)
    else:
        # NOTE(review): "no tags" is stored as an empty list rather than an
        # empty string -- confirm the consumers of this column expect that.
        tag_cell = []
    return pd.DataFrame([[tag_cell]], columns=['tags'])
def extra_feature(filepath):
    """Build a one-row DataFrame of the seven "extra" email checks.

    Each check helper is called with ``filepath``; ``None`` and ``False``
    results are stored as 0, ``True`` as 1, and any other value unchanged.

    Args:
        filepath: Path of the email file.

    Returns:
        A single-row DataFrame with one column per check.
    """
    def as_number(value):
        # Identity checks, so only the real None/True/False singletons are
        # converted; every other value passes through untouched.
        if value is None or value is False:
            return 0
        if value is True:
            return 1
        return value

    # Helpers listed in the same order as the output columns.
    checks = (
        check_spf,
        check_dkim,
        check_dmarc,
        check_deliver_receiver,
        check_encript,
        get_onclicks,
        check_popWindow,
    )
    row = [as_number(check(filepath)) for check in checks]

    column_names = [
        'SPF(Pass:1,Neutral:2,Softdail:3,None:0)',
        'DKIM',
        'DMARC',
        'Deliver-to Matches Receiver',
        'Message_encrtpted',
        'Onclick_events',
        'Popwindow',
    ]
    return pd.DataFrame([row], columns=column_names)
def num_feature(filepath):
    """Build a one-row DataFrame of the nine numeric email features.

    Each extractor is called with ``filepath``; a ``None`` result is stored
    as 0 and every other value is kept as returned.

    Args:
        filepath: Path of the email file.

    Returns:
        A single-row DataFrame with one column per extractor.
    """
    # Column name -> extractor, in output (and call) order.
    extractors = {
        'body richness': get_body_richness,
        'Include function words': get_num_FunctionWords,
        'Subject richness': get_sbj_richness,
        'Numers of URLs': get_num_urls,
        'IPURLs': get_num_urls_ip,
        'ImageURLs': get_num_image_urls,
        'DomainURLs': get_num_domain_urls,
        'URLs contain port information': get_num_url_ports,
        'Characters in senders': get_chars_sender,
    }
    raw_values = [extract(filepath) for extract in extractors.values()]
    row = [0 if value is None else value for value in raw_values]
    return pd.DataFrame([row], columns=list(extractors))
def get_features(filepath):
    """Assemble every feature group for one email into a single DataFrame.

    Args:
        filepath: Path of the email file, forwarded to each feature builder.

    Returns:
        The column-wise concatenation of the text, HTML-tag, numeric and
        extra feature frames, in that order.
    """
    text_df = text_feature(filepath)
    tags_df = html_tags_feature(filepath)
    extra_df = extra_feature(filepath)
    numeric_df = num_feature(filepath)

    # Numeric columns are placed before the extra columns in the result.
    frames = [text_df, tags_df, numeric_df, extra_df]
    return pd.concat(frames, axis="columns")
def predict_content(content):
    """Classify email text with the model stored in SVM_finalcontent.pkl.

    Args:
        content: Text input accepted by ``preprocess_content``.

    Returns:
        ``"Legitimate"`` when the first predicted label is ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/SVM_finalcontent.pkl")
    features = preprocess_content(content)
    labels = model.predict(features)
    if labels[0] == 'ham':
        return "Legitimate"
    return "Phishing"
def predict_html(html_tag):
    """Classify an email's HTML tags with the model stored in Stack_tag.pkl.

    Args:
        html_tag: Tag input accepted by ``preprocess_html``.

    Returns:
        ``"Legitimate"`` when the first predicted label is ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/Stack_tag.pkl")
    features = preprocess_html(html_tag)
    labels = model.predict(features)
    if labels[0] == 'ham':
        return "Legitimate"
    return "Phishing"
def predict_num(num_df):
    """Classify numeric email features with the model stored in RF_Num.pkl.

    Args:
        num_df: Numeric feature frame accepted by ``preprocess_num``.

    Returns:
        ``"Legitimate"`` when the first predicted label is ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/RF_Num.pkl")
    features = preprocess_num(num_df)
    labels = model.predict(features)
    if labels[0] == 'ham':
        return "Legitimate"
    return "Phishing"
def predict_extra(extra_df):
    """Classify the extra email checks with the model stored in RF_extra.pkl.

    Args:
        extra_df: Extra feature frame accepted by ``preprocess_extra``.

    Returns:
        ``"Legitimate"`` when the first predicted label is ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/RF_extra.pkl")
    features = preprocess_extra(extra_df)
    labels = model.predict(features)
    if labels[0] == 'ham':
        return "Legitimate"
    return "Phishing"
def preprocess_content(content):
    """Transform text input with the pickled vectorizer for email content.

    Args:
        content: Input handed unchanged to the vectorizer's ``transform``.

    Returns:
        Whatever the unpickled object's ``transform`` returns for ``content``.
    """
    vectorizer_path = 'vectorizer/content_tfidf.pickle'
    with open(vectorizer_path, 'rb') as handle:
        vectorizer = pickle.load(handle)
    return vectorizer.transform(content)
def preprocess_html(html_tag):
    """Transform HTML-tag input with the pickled vectorizer for tags.

    Args:
        html_tag: Input handed unchanged to the vectorizer's ``transform``.

    Returns:
        Whatever the unpickled object's ``transform`` returns for ``html_tag``.
    """
    vectorizer_path = 'vectorizer/html_cv.pickle'
    with open(vectorizer_path, 'rb') as handle:
        vectorizer = pickle.load(handle)
    return vectorizer.transform(html_tag)
def preprocess_num(num_df):
    """Scale the numeric feature frame with the pickled numeric scaler.

    Args:
        num_df: Object exposing ``.values``; those values are what gets
            passed to the scaler's ``transform``.

    Returns:
        Whatever the unpickled scaler's ``transform`` returns.
    """
    scaler_path = 'vectorizer/num_scaler.pkl'
    with open(scaler_path, 'rb') as handle:
        scaler = pickle.load(handle)
    return scaler.transform(num_df.values)
def preprocess_extra(extra_df):
    """Scale the extra feature frame with the pickled extra-feature scaler.

    Args:
        extra_df: Object exposing ``.values``; those values are what gets
            passed to the scaler's ``transform``.

    Returns:
        Whatever the unpickled scaler's ``transform`` returns.
    """
    scaler_path = 'vectorizer/extra_scaler.pkl'
    with open(scaler_path, 'rb') as handle:
        scaler = pickle.load(handle)
    return scaler.transform(extra_df.values)
# Single lemmatizer instance shared by every customtokenize() call.
lemmatizer = WordNetLemmatizer()


def customtokenize(str):
    """Tokenize text, drop English stopwords, and lemmatize what remains.

    Args:
        str: Text to tokenize. The name shadows the builtin but is kept
            because it is part of the function's existing signature.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    # Split string as tokens
    tokens = nltk.word_tokenize(str)
    # Build the stopword set once per call: this avoids calling
    # stopwords.words() for every token and makes each membership test O(1).
    english_stopwords = set(stopwords.words('english'))
    # Filter for stopwords, then perform lemmatization
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]