Spaces:
Runtime error
allenchienxxx committed
Commit • 01cbf36
1 Parent(s): 0cc4e8f
Upload 13 files
- analze.py +154 -0
- main.py +34 -0
- modules.py +377 -0
- save_models/RF_Num.pkl +3 -0
- save_models/RF_extra.pkl +3 -0
- save_models/SVM_finalcontent.pkl +3 -0
- save_models/Stack_tag.pkl +3 -0
- static/css/styles.css +86 -0
- templates/home.html +37 -0
- vectorizer/content_tfidf.pickle +3 -0
- vectorizer/extra_scaler.pkl +3 -0
- vectorizer/html_cv.pickle +3 -0
- vectorizer/num_scaler.pkl +3 -0
analze.py
ADDED
@@ -0,0 +1,154 @@
+from modules import *
+from pathlib import Path
+import pandas as pd
+from flask import Flask, render_template, request
+import nltk
+import pickle
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from joblib import load
+import sklearn
+import ssl
+
+try:
+    _create_unverified_https_context = ssl._create_unverified_context
+except AttributeError:
+    pass
+else:
+    ssl._create_default_https_context = _create_unverified_https_context
+# nltk.download('stopwords')
+# nltk.download('punkt')
+# nltk.download('omw-1.4')
+# nltk.download('wordnet')
+
+def check_file_type(file):
+    file_extension = Path(file.filename).suffix.lower()
+    if file_extension == '.eml' or file_extension == '.txt':
+        save_file(file)
+        return 'Extracted Features'
+        # return get_features('email files/' + file.filename)
+    else:
+        return "Please select a .eml or .txt file."
+
+def save_file(file):
+    # assumes the 'email files' directory already exists
+    file_path = 'email files/' + file.filename
+    with open(file_path, 'w') as f:
+        f.write(file.read().decode('utf-8'))
+
+def text_feature(filepath):
+    text = get_text(filepath)
+    # print(text)
+    if text != "":
+        text = text.split()
+    textlist = ' '.join(text)
+    dataf = pd.DataFrame([[textlist]], columns=['text'])
+    return dataf
+
+def html_tags_feature(filepath):
+    tags = get_tags_from_html(get_html_general(filepath))
+    # fall back to an empty string so the vectorizer always receives text
+    taglist = ' '.join(tags) if tags != [] else ''
+    dataf = pd.DataFrame([[taglist]], columns=['tags'])
+    return dataf
+
+def extra_feature(filepath):
+    spf = check_spf(filepath)
+    dkim = check_dkim(filepath)
+    dmarc = check_dmarc(filepath)
+    deliver_receiver = check_deliver_receiver(filepath)
+    encript = check_encript(filepath)
+    onclick = get_onclicks(filepath)
+    popwindow = check_popWindow(filepath)
+    extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
+    # normalize None/True/False to numeric flags
+    extra_data_row = [0 if x is None else x for x in extra_data_row]
+    extra_data_row = [1 if x is True else x for x in extra_data_row]
+    extra_data_row = [0 if x is False else x for x in extra_data_row]
+    extra_data = pd.DataFrame([extra_data_row],
+                              columns=['SPF(Pass:1,Neutral:2,Softfail:3,None:0)', 'DKIM', 'DMARC',
+                                       'Deliver-to Matches Receiver', 'Message_encrypted',
+                                       'Onclick_events', 'Popwindow'])
+    return extra_data
+
+def num_feature(filepath):
+    body_richness = get_body_richness(filepath)
+    func_words = get_num_FunctionWords(filepath)
+    sbj_richness = get_sbj_richness(filepath)
+    urls = get_num_urls(filepath)
+    ipurls = get_num_urls_ip(filepath)
+    imageurls = get_num_image_urls(filepath)
+    domainurls = get_num_domain_urls(filepath)
+    urlport = get_num_url_ports(filepath)
+    sen_chars = get_chars_sender(filepath)
+    num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls, imageurls, domainurls, urlport, sen_chars]
+    num_data_row = [0 if x is None else x for x in num_data_row]
+    num_data = pd.DataFrame([num_data_row],
+                            columns=['body richness', 'Include function words', 'Subject richness',
+                                     'Numbers of URLs', 'IPURLs', 'ImageURLs', 'DomainURLs',
+                                     'URLs contain port information', 'Characters in sender'])
+    return num_data
+
+def get_features(filepath):
+    # text
+    textlist = text_feature(filepath)
+    # html tags
+    taglist = html_tags_feature(filepath)
+    # extra features
+    extra_data = extra_feature(filepath)
+    # numeric data
+    num_data = num_feature(filepath)
+    combined_df = pd.concat([textlist, taglist, num_data, extra_data], axis=1)
+    # print(combined_df)
+    return combined_df
+
+
+def predict_content(content):
+    content_clf = load("save_models/SVM_finalcontent.pkl")
+    predict = content_clf.predict(preprocess_content(content))
+    return "Legitimate" if predict[0] == 'ham' else "Phishing"
+
+def predict_html(html_tag):
+    html_clf = load("save_models/Stack_tag.pkl")
+    predict = html_clf.predict(preprocess_html(html_tag))
+    return "Legitimate" if predict[0] == 'ham' else "Phishing"
+
+def predict_num(num_df):
+    num_clf = load("save_models/RF_Num.pkl")
+    predict = num_clf.predict(preprocess_num(num_df))
+    return "Legitimate" if predict[0] == 'ham' else "Phishing"
+
+def predict_extra(extra_df):
+    extra_clf = load("save_models/RF_extra.pkl")
+    predict = extra_clf.predict(preprocess_extra(extra_df))
+    return "Legitimate" if predict[0] == 'ham' else "Phishing"
+
+def preprocess_content(content):
+    with open('vectorizer/content_tfidf.pickle', 'rb') as f:
+        tfidf = pickle.load(f)
+    # Transform the 'text' column to TF-IDF (iterating the DataFrame itself
+    # would yield column names, not documents)
+    content_tfidf = tfidf.transform(content['text'])
+    return content_tfidf
+
+def preprocess_html(html_tag):
+    with open('vectorizer/html_cv.pickle', 'rb') as f:
+        cv = pickle.load(f)
+    # same as above: vectorize the 'tags' column, not the frame
+    tag_data = cv.transform(html_tag['tags'])
+    return tag_data
+
+def preprocess_num(num_df):
+    with open('vectorizer/num_scaler.pkl', 'rb') as f:
+        num_scaler = pickle.load(f)
+    scale_num = num_scaler.transform(num_df.values)
+    return scale_num
+
+def preprocess_extra(extra_df):
+    with open('vectorizer/extra_scaler.pkl', 'rb') as f:
+        extra_scaler = pickle.load(f)
+    scale_extra = extra_scaler.transform(extra_df.values)
+    return scale_extra
+
+
+lemmatizer = WordNetLemmatizer()
+def customtokenize(text):
+    # Split string into tokens
+    tokens = nltk.word_tokenize(text)
+    # Filter out stopwords
+    nostop = list(filter(lambda token: token not in stopwords.words('english'), tokens))
+    # Perform lemmatization
+    lemmatized = [lemmatizer.lemmatize(word) for word in nostop]
+    return lemmatized
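
The functions above chain together in a fixed order: feature extraction, then the matching preprocessor, then the pickled classifier. A minimal sketch of driving them outside Flask, assuming the models and vectorizers above are in place; the path 'email files/sample.eml' is a placeholder, not part of this commit:

    # hypothetical driver, not part of this commit
    from analze import (get_features, text_feature, html_tags_feature,
                        num_feature, extra_feature, predict_content,
                        predict_html, predict_num, predict_extra)

    path = 'email files/sample.eml'  # placeholder .eml file
    print(get_features(path))                    # one-row DataFrame with all features
    print(predict_content(text_feature(path)))   # "Legitimate" or "Phishing"
    print(predict_html(html_tags_feature(path)))
    print(predict_num(num_feature(path)))
    print(predict_extra(extra_feature(path)))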
main.py
ADDED
@@ -0,0 +1,34 @@
+from analze import *
+
+app = Flask(__name__)
+
+@app.route('/')
+def home():
+    return render_template('home.html')
+
+
+@app.route('/upload', methods=['GET', 'POST'])
+def upload_file():
+    if request.method == 'POST':
+        # Check if a file was uploaded
+        if 'file' not in request.files:
+            return render_template('home.html', content='No file uploaded.')
+        file = request.files['file']
+        # Check if the file has a filename
+        if file.filename == '':
+            return render_template('home.html', content='No file selected.')
+        filepath = 'email files/' + file.filename
+        # check_file_type() runs first (keyword arguments evaluate left to
+        # right) and saves the upload that the feature extractors read back
+        return render_template('home.html',
+                               content=check_file_type(file),
+                               features=get_features(filepath),
+                               pre_content=predict_content(text_feature(filepath)),
+                               pre_tag=predict_html(html_tags_feature(filepath)),
+                               pre_num=predict_num(num_feature(filepath)),
+                               pre_extra=predict_extra(extra_feature(filepath)))
+
+    return render_template('home.html')
+
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=8000)
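
For reference, a hedged sketch of exercising the /upload route once the app is listening on port 8000; the file name is a placeholder and requests is an extra dependency, not part of this commit:

    # hypothetical client, not part of this commit
    import requests

    with open('sample.eml', 'rb') as f:  # placeholder email file
        resp = requests.post('http://localhost:8000/upload',
                             files={'file': ('sample.eml', f)})
    print(resp.status_code)  # 200 with the rendered home.html on success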
modules.py
ADDED
@@ -0,0 +1,377 @@
+from bs4 import BeautifulSoup
+import email
+import ipaddress
+from urllib.parse import urlparse
+import urllib.request
+import re
+
+# extract visible text from html content
+def get_text_from_html(html_content):
+    soup = BeautifulSoup(html_content, 'html.parser')
+    # extract all the text
+    all_text = soup.get_text()
+    all_text = re.sub(r"\s+", " ", all_text)
+    # print(all_text)
+    return all_text
+
+# get text/plain content from an email, falling back to html text
+def get_text(file_path):
+    with open(file_path, 'rb') as file:
+        message = email.message_from_bytes(file.read())
+        text_content = ""
+        for part in message.walk():
+            if part.get_content_type() == 'text/plain':
+                text_content += part.get_payload(decode=True).decode('iso-8859-1')
+        # print(text_content)
+        # fall back to the html body before returning, not after
+        if text_content == "":
+            return get_text_from_html(get_html_general(file_path))
+        return text_content.replace("\n", "")
+
+# get text/html parts from an email
+def get_email_html(file_path):
+    with open(file_path, 'rb') as file:
+        content = email.message_from_bytes(file.read())
+        html_content = ""
+        for part in content.walk():
+            if part.get_content_type() == 'text/html':
+                html_content += part.get_payload(decode=True).decode('iso-8859-1')
+        # str.replace returns a new string; the result must be rebound
+        html_content = html_content.replace("\n", "")
+        if html_content != "":
+            # print("Found html at " + file_path)
+            return html_content
+        else:
+            # print("No html content found at " + file_path)
+            return ""
+
+# get html by searching for an <html> tag
+def get_html(file_path):
+    with open(file_path, 'r', encoding='iso-8859-1') as file:
+        html_flag = False
+        html_content = ""
+        for line in file:
+            words = line.split()
+            for word in words:
+                if word == "<html>":
+                    html_flag = True
+                if html_flag:
+                    html_content += word
+                if word == "</html>":
+                    html_flag = False
+        # print(html_content)
+        html_content = html_content.replace("\n", "")
+        if html_content == "":
+            # print("No html content found at " + file_path)
+            return ""
+        else:
+            # print("Found html at " + file_path)
+            return html_content
+
+def get_html_general(file_path):
+    if get_email_html(file_path) != "":
+        return get_email_html(file_path)
+    else:
+        return get_html(file_path)
+
+# count elements with an onClick attribute
+def get_onclicks(file_path):
+    content = get_html_general(file_path)
+    if content == "": return None
+    soup = BeautifulSoup(content, 'html.parser')
+    elements = soup.find_all(attrs={'onClick': True})
+    # Count the number of elements with an onClick attribute
+    count = len(elements)
+    return count
+
+# check for scripts that open pop-up windows
+def check_popWindow(file_path):
+    content = get_html_general(file_path)
+    if content == "": return None
+    soup = BeautifulSoup(content, 'html.parser')
+    # Check whether any <script> tags contain window.open
+    try:
+        scripts = soup.find_all('script', text=lambda text: 'window.open' in text)
+        if scripts:
+            return True
+        else:
+            return False
+    except TypeError:
+        # a <script> with no text makes the lambda raise TypeError
+        return False
+
+# map the Received-SPF header to a numeric code (Pass:1, Neutral:2, Softfail:3, None/other:0)
+def check_spf(file_path):
+    with open(file_path, 'rb') as file:
+        message = email.message_from_bytes(file.read())
+        received_spf_header = message.get('Received-SPF')
+        if received_spf_header is None:
+            return 0
+        if received_spf_header:
+            spf_result = received_spf_header.split()[0].lower()
+            if spf_result == 'pass':
+                return 1
+            elif spf_result == 'neutral':
+                return 2
+            elif spf_result == 'softfail':
+                return 3
+            else:
+                return 0
+        else:
+            return 0
+
+def check_dkim(file_path):
+    with open(file_path, 'rb') as file:
+        message = email.message_from_bytes(file.read())
+        auth = message.get('Authentication-Results')
+        if auth is None:
+            return 0
+        auth_result = auth.split()
+        # print(auth)
+        if 'dkim=pass' in auth_result:
+            return 1
+        else:
+            return 0
+
+def check_dmarc(file_path):
+    with open(file_path, 'rb') as file:
+        message = email.message_from_bytes(file.read())
+        auth = message.get('Authentication-Results')
+        if auth is None:
+            return 0
+        auth_result = auth.split()
+        if 'dmarc=pass' in auth_result:
+            return 1
+        else:
+            return 0
+
+# check whether Delivered-To matches the To header
+def check_deliver_receiver(filepath):
+    with open(filepath, 'rb') as file:
+        message = email.message_from_bytes(file.read())
+        deliver = message.get('Delivered-To')
+        receiver = message.get('To')
+        if deliver == receiver:
+            return 1
+        else:
+            return 0
+
+# check the Received headers for a TLS version string
+def check_encript(filepath):
+    with open(filepath, 'rb') as file:
+        message = email.message_from_bytes(file.read())
+        received_headers = message.get_all('Received')
+        # print(received_headers)
+        version_string = 'version'
+        try:
+            for received_header in received_headers:
+                if version_string in received_header:
+                    return 1
+        except TypeError:
+            # no Received headers at all
+            return 0
+        return 0
+
+def get_tags_from_html(html_content):
+    soup = BeautifulSoup(html_content, 'html.parser')
+    tag_list = []
+    html_tags = soup.find_all()
+    for tag in html_tags:
+        tag_list += [tag.name]
+    # print(tag_list)
+    return tag_list
+
+# get urls in html content
+def get_urls_from_html(html_content):
+    soup = BeautifulSoup(html_content, 'html.parser')
+    urls = []
+    # collect hrefs from all anchor tags
+    anchor_tags = soup.find_all('a')
+    for tag in anchor_tags:
+        href = tag.get('href')
+        if href:
+            if re.match('^https?://', href):
+                # print(href)
+                urls += [href]
+    return urls
+
+# count words in the email text (falling back to html text)
+def get_num_words(file_path):
+    if get_text(file_path) != "":
+        words = len(get_text(file_path).split())
+        return words
+    if get_html_general(file_path) != "":
+        words = len(get_text_from_html(get_html_general(file_path)).split())
+        return words
+    else:
+        return 0
+
+# get how many characters are in the email text or html
+def get_num_chars(file_path):
+    if get_text(file_path) != "":
+        chars = len(get_text(file_path).replace(" ", ""))
+        return chars
+    if get_html_general(file_path) != "":
+        chars = len(get_text_from_html(get_html_general(file_path)).replace(" ", ""))
+        return chars
+    else:
+        return 0
+
+# calculate the body richness by dividing the number of words by the number of characters
+def get_body_richness(filepath):
+    if get_num_chars(filepath) == 0: return 0
+    return get_num_words(filepath) / get_num_chars(filepath)
+
+# count how many function words appear in the content
+def get_num_FunctionWords(file_path):
+    function_words = ["account", "access", "bank", "credit", "click", "identity",
+                      "inconvenience", "information", "limited", "log", "minutes",
+                      "password", "recently", "risk", "social", "security",
+                      "service", "suspended"]
+    content = ""
+    count = 0
+    if get_text(file_path) != "":
+        content = get_text(file_path).split()
+    elif get_html_general(file_path) != "":
+        content = get_text_from_html(get_html_general(file_path)).split()
+    else:
+        return None
+    for w in function_words:
+        if w in content:
+            count += 1
+    return count
+
+# get how many words are in the subject
+def get_num_sbj(file_path):
+    count = len(get_subject(file_path).split())
+    return count
+
+def get_subject(file_path):
+    with open(file_path, 'rb') as file:
+        message = email.message_from_bytes(file.read())
+        headers = message.items()
+        subject = ""
+        for header in headers:
+            if header[0] == "Subject":
+                # print(header[1])
+                subject = header[1]
+                break
+        # if subject == "":
+        #     print("No subject found")
+        subject = re.sub(r"\s+", " ", str(subject))
+        return subject
+
+
+def get_sender(file_path):
+    with open(file_path, 'rb') as file:
+        message = email.message_from_bytes(file.read())
+        headers = message.items()
+        sender = ""
+        for header in headers:
+            if header[0] == "From":
+                # print(header[1])
+                sender = header[1]
+                break
+        if sender == "":
+            return None
+        return sender
+
+# get how many characters are in the subject
+def get_num_sbjChar(file_path):
+    count = len(get_subject(file_path))
+    return count
+
+# calculate the subject richness by dividing words by characters
+def get_sbj_richness(file_path):
+    if get_num_sbjChar(file_path) == 0: return 0
+    return get_num_sbj(file_path) / get_num_sbjChar(file_path)
+
+# count urls whose hostname is an ip address
+def get_num_urls_ip(file_path):
+    content = get_html_general(file_path)
+    if content == "": return 0
+    urls = get_urls_from_html(content)
+    num_ip = 0
+    for url in urls:
+        hostname = urlparse(url).hostname
+        try:
+            ipaddress.ip_address(hostname)
+            num_ip += 1
+            # print(f"{url} contains an IP address")
+        except ValueError:
+            pass
+            # print(f"{url} does not contain an IP address")
+    return num_ip
+
+# return the total number of urls in the html content
+def get_num_urls(file_path):
+    urls = get_urls_from_html(get_html_general(file_path))
+    if urls == []:
+        return None
+    return len(urls)
+
+# get how many clickable image urls are in the html
+def get_num_image_urls(file_path):
+    soup = BeautifulSoup(get_html_general(file_path), 'html.parser')
+    # Find all <a> tags that contain an <img> tag
+    image_links = soup.find_all('a', href=True)
+    image_links_with_img = [link for link in image_links if link.find('img')]
+    return len(image_links_with_img)
+
+# get the number of distinct url domains
+def get_num_domain_urls(file_path):
+    urls = get_urls_from_html(get_html_general(file_path))
+    domains = set()
+    for url in urls:
+        match = re.search(r'https?://([^/]+)/', url)
+        if match:
+            domain = match.group(1)
+            domains.add(domain)
+    # Count the number of distinct domains
+    num_domains = len(domains)
+    return num_domains
+
+
+# get how many urls contain port info
+def get_num_url_ports(file_path):
+    urls = get_urls_from_html(get_html_general(file_path))
+    count = 0
+    for url in urls:
+        parsed_url = urlparse(url)
+        # Check if the parsed URL includes an explicit port number
+        if parsed_url.port:
+            count += 1
+    return count
+
+
+# get how many characters are in the sender
+def get_chars_sender(file_path):
+    sender = get_sender(file_path)
+    return len(str(sender))
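
A quick, illustrative sanity check of the URL helper on an inline snippet (assumes modules.py is importable; the URLs are made up):

    # hypothetical example, not part of this commit
    from modules import get_urls_from_html

    html = '<a href="http://203.0.113.7/login">a</a> <a href="https://example.com:8080/">b</a>'
    print(get_urls_from_html(html))
    # ['http://203.0.113.7/login', 'https://example.com:8080/']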
save_models/RF_Num.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2442b46ad908f4a41ce2030e10d3e59b92635396fb95c3a0d85aa74262720ef5
+size 5911369
save_models/RF_extra.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:881e7727b12970a7e473e00944f6bfbf9afd732300ce48af8d714e1ceafcfb06
+size 183913
save_models/SVM_finalcontent.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4541c52b73022168b124d0f115f717e55f50553fe6eea9afccd07524de0e019
+size 4304747
save_models/Stack_tag.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e658df654d3620130d41ac50be3788e29e81b58c6974e1e89d06c59ad14a7f4
+size 7632960
static/css/styles.css
ADDED
@@ -0,0 +1,86 @@
+body {
+    background-color: lightblue;
+    margin: 0;
+    color: black;
+}
+
+h1 { color: #111; font-family: 'Helvetica Neue', sans-serif; font-size: 50px; font-weight: bold; letter-spacing: -1px; line-height: 1; text-align: center; }
+
+h2 { color: #111; font-family: 'Open Sans', sans-serif; font-size: 20px; font-weight: 300; line-height: 32px; margin: 0 0 30px; text-align: center; }
+
+p { color: #685206; font-family: 'Helvetica Neue', sans-serif; font-size: 15px; line-height: 24px; margin: 0 0 24px; text-align: justify; text-justify: inter-word; }
+
+.list {
+    max-width: 400px;
+    overflow-x: auto;
+    list-style: none;
+}
+
+.container {
+    display: flex;
+}
+
+.box {
+    border: 5px dashed black;
+    width: 500px;
+    margin: 50px;
+    padding: 10px;
+    float: left;
+}
+
+.pretty {
+    font-family: "Helvetica Neue", Arial, sans-serif;
+    font-size: 14px;
+    line-height: 1.5;
+    text-align: left;
+    text-shadow: 1px 1px 1px rgba(0, 0, 0, 0.1);
+    text-transform: uppercase;
+    letter-spacing: 1px;
+    word-spacing: 2px;
+    list-style: none;
+}
+
+.header {
+    padding: 10px;
+    text-align: center;
+    font-size: 24px;
+    border: 5px dashed black;
+}
+
+.button-81 {
+    background-color: #fff;
+    border: 0 solid #e2e8f0;
+    border-radius: 1.5rem;
+    box-sizing: border-box;
+    color: #0d172a;
+    cursor: pointer;
+    display: inline-block;
+    font-family: "Basier circle",-apple-system,system-ui,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";
+    font-size: 1.1rem;
+    font-weight: 600;
+    line-height: 1;
+    padding: 1rem 1.6rem;
+    text-align: center;
+    text-decoration: none #0d172a solid;
+    text-decoration-thickness: auto;
+    transition: all .1s cubic-bezier(.4, 0, .2, 1);
+    box-shadow: 0px 1px 2px rgba(166, 175, 195, 0.25);
+    user-select: none;
+    -webkit-user-select: none;
+    touch-action: manipulation;
+}
+
+.button-81:hover {
+    background-color: #1e293b;
+    color: #fff;
+}
+
+@media (min-width: 768px) {
+    .button-81 {
+        font-size: 1.125rem;
+        padding: 1rem 2rem;
+    }
+}
templates/home.html
ADDED
@@ -0,0 +1,37 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Data Visualization</title>
+    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='css/styles.css') }}">
+</head>
+<body>
+    <div class="header">
+        <h1>Welcome to Phishing Email Detection!</h1>
+        <form method="POST" action="/upload" enctype="multipart/form-data">
+            <input type="file" name="file" id="file-input" accept=".txt,.eml">
+            <input type="submit" value="Upload" class="button-81">
+        </form>
+    </div>
+    <div class="container">
+        <div class="box">
+            {% if content %}
+            <h2>{{ content }}</h2>
+            {% endif %}
+            <ul class="list">
+                {% for feature in features %}
+                <li><pre class="pretty">{{ feature }}: <p>{{ features[feature][0] }}</p></pre></li>
+                {% endfor %}
+            </ul>
+        </div>
+        <div class="box">
+            <h2>Prediction</h2>
+            <ul class="pretty">
+                <li>Content prediction: <p>{{ pre_content }}</p></li>
+                <li>HTML tag prediction: <p>{{ pre_tag }}</p></li>
+                <li>Numeric prediction: <p>{{ pre_num }}</p></li>
+                <li>Extra prediction: <p>{{ pre_extra }}</p></li>
+            </ul>
+        </div>
+    </div>
+</body>
+</html>
vectorizer/content_tfidf.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c78b2719d42bf0c36db85c60270770fb6decd878bf2e61cddf13bf2cdee8e19f
+size 4422275
vectorizer/extra_scaler.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3548c38d06e7e04f11df4bcdb29ad7aaeee985af2e3701f4f9d51a79cd7de041
+size 776
vectorizer/html_cv.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84cf833e1bd8975474669746947e93a7bf4b9ec1046f9d8e88d98dc459c860f9
+size 6814
vectorizer/num_scaler.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1a847823b219353781e60d8672e4c2b88720d111dc0a543c3ece441f52ce06f
+size 665