File size: 5,603 Bytes
01cbf36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from modules import *
from pathlib import Path
import pandas as pd
from flask import Flask, render_template, request
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from joblib import load
import sklearn
import ssl

# NOTE(security): when the running Python exposes ssl._create_unverified_context,
# install it as the ssl module's default HTTPS context factory. This turns off
# certificate verification for code that relies on that default; presumably it
# is only here so the nltk.download calls below can run behind a broken
# certificate chain -- confirm before shipping.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('wordnet')

def check_file_type(file):
    """Validate an uploaded file's extension and store it when accepted.

    Args:
        file: upload object exposing a ``filename`` attribute; it is handed
            to ``save_file`` unchanged when the extension is accepted.

    Returns:
        ``'Extracted Features'`` after saving a ``.eml``/``.txt`` upload
        (extension compared case-insensitively), otherwise a message asking
        the user to pick a supported file. Feature extraction itself is not
        triggered here.
    """
    suffix = Path(file.filename).suffix.lower()

    # Reject anything that is not a supported e-mail/text file up front.
    if suffix not in ('.eml', '.txt'):
        return "Please select .eml or .txt file."

    save_file(file)
    return 'Extracted Features'

def save_file(file):
    """Persist an uploaded file under the ``email files/`` directory.

    The upload's bytes are decoded as UTF-8 and written back as UTF-8 without
    newline translation, so the stored file matches the uploaded content.

    Args:
        file: upload object exposing ``filename`` (client-supplied name) and
            ``read()`` returning the raw bytes.

    Raises:
        UnicodeDecodeError: if the upload is not valid UTF-8. Decoding happens
            before the target is opened, so no empty file is left behind.
        OSError: if the target cannot be opened for writing (for example when
            the sanitised name is empty or ``..``).
    """
    # The filename comes from the client: normalise Windows separators and keep
    # only the final path component so names like "../../x" cannot escape the
    # upload directory.
    safe_name = Path(file.filename.replace('\\', '/')).name

    upload_dir = Path('email files')
    upload_dir.mkdir(parents=True, exist_ok=True)

    text = file.read().decode('utf-8')

    # Explicit encoding/newline: do not depend on the platform locale and do
    # not rewrite line endings of the stored e-mail.
    with open(upload_dir / safe_name, 'w', encoding='utf-8', newline='') as f:
        f.write(text)

def text_feature(filepath):
    """Return the e-mail text as a one-row DataFrame with a ``text`` column.

    The text obtained from ``get_text(filepath)`` is split on whitespace and
    re-joined with single spaces before being stored.

    Returns:
        A 1x1 ``pandas.DataFrame`` (column ``'text'``), or ``None`` when
        ``get_text`` yields an empty string.
    """
    body = get_text(filepath)
    if body == "":
        return None

    # Collapse every run of whitespace into a single space.
    normalised = ' '.join(body.split())
    return pd.DataFrame([[normalised]], columns=['text'])

def html_tags_feature(filepath):
    """Return the e-mail's HTML tags as a one-row DataFrame with a ``tags`` column.

    Tags come from ``get_tags_from_html(get_html_general(filepath))`` and are
    joined into one space-separated string.

    Returns:
        A 1x1 ``pandas.DataFrame`` (column ``'tags'``).
    """
    html = get_html_general(filepath)
    tags = get_tags_from_html(html)

    if tags != []:
        cell = ' '.join(tags)
    else:
        # NOTE(review): an empty tag list is stored as an empty *list*, not an
        # empty string, so the cell type differs between the two cases --
        # confirm downstream consumers expect this.
        cell = []

    return pd.DataFrame([[cell]], columns=['tags'])

def extra_feature(filepath):
    """Collect the header/script indicator features into a one-row DataFrame.

    Each indicator is produced by a helper called with ``filepath``. ``None``
    and ``False`` results are stored as 0, ``True`` as 1, and any other value
    is kept unchanged.

    Returns:
        A one-row ``pandas.DataFrame`` with seven columns, in the order SPF,
        DKIM, DMARC, deliver-to match, encryption, onclick events, pop-up
        window.
    """
    def _to_numeric(value):
        # Identity checks on purpose: only the real None/True/False singletons
        # are remapped; numeric codes such as 0, 1, 2 or 3 pass through untouched.
        if value is None or value is False:
            return 0
        if value is True:
            return 1
        return value

    raw_indicators = (
        check_spf(filepath),
        check_dkim(filepath),
        check_dmarc(filepath),
        check_deliver_receiver(filepath),
        check_encript(filepath),
        get_onclicks(filepath),
        check_popWindow(filepath),
    )
    column_names = [
        'SPF(Pass:1,Neutral:2,Softdail:3,None:0)',
        'DKIM',
        'DMARC',
        'Deliver-to Matches Receiver',
        'Message_encrtpted',
        'Onclick_events',
        'Popwindow',
    ]
    row = [_to_numeric(value) for value in raw_indicators]
    return pd.DataFrame([row], columns=column_names)

def num_feature(filepath):
    """Collect the numeric e-mail features into a one-row DataFrame.

    Every value comes from a helper called with ``filepath``; a ``None``
    result is stored as 0 and everything else is kept as returned.

    Returns:
        A one-row ``pandas.DataFrame`` with nine columns (richness measures,
        function-word count, URL counts and sender character count).
    """
    measurements = (
        get_body_richness(filepath),
        get_num_FunctionWords(filepath),
        get_sbj_richness(filepath),
        get_num_urls(filepath),
        get_num_urls_ip(filepath),
        get_num_image_urls(filepath),
        get_num_domain_urls(filepath),
        get_num_url_ports(filepath),
        get_chars_sender(filepath),
    )
    column_names = [
        'body richness',
        'Include function words',
        'Subject richness',
        'Numers of URLs',
        'IPURLs',
        'ImageURLs',
        'DomainURLs',
        'URLs contain port information',
        'Characters in senders',
    ]
    # Missing measurements are represented as 0.
    row = [value if value is not None else 0 for value in measurements]
    return pd.DataFrame([row], columns=column_names)
def get_features(filepath):
    """Extract every feature group for one e-mail and join them column-wise.

    Args:
        filepath: path of the stored e-mail, forwarded to each extractor.

    Returns:
        The result of concatenating, along ``axis=1``, the text, HTML-tag,
        numeric and extra feature frames (in that column order). Note that
        ``text_feature`` may return ``None`` for an empty body; that entry is
        passed to ``pd.concat`` unchanged.
    """
    text_frame = text_feature(filepath)
    tag_frame = html_tags_feature(filepath)
    extra_frame = extra_feature(filepath)
    numeric_frame = num_feature(filepath)

    # Column order differs from extraction order: numeric comes before extra.
    return pd.concat([text_frame, tag_frame, numeric_frame, extra_frame], axis=1)


def predict_content(content):
    """Classify the e-mail text with the saved content model.

    The model is loaded from ``save_models/SVM_finalcontent.pkl`` on every
    call and applied to ``preprocess_content(content)``.

    Returns:
        ``"Legitimate"`` when the first predicted label is ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/SVM_finalcontent.pkl")
    features = preprocess_content(content)
    first_label = model.predict(features)[0]
    if first_label == 'ham':
        return "Legitimate"
    return "Phishing"

def predict_html(html_tag):
    """Classify the e-mail's HTML tags with the saved tag model.

    The model is loaded from ``save_models/Stack_tag.pkl`` on every call and
    applied to ``preprocess_html(html_tag)``.

    Returns:
        ``"Legitimate"`` when the first predicted label is ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/Stack_tag.pkl")
    features = preprocess_html(html_tag)
    first_label = model.predict(features)[0]
    if first_label == 'ham':
        return "Legitimate"
    return "Phishing"

def predict_num(num_df):
    """Classify the numeric feature row with the saved numeric model.

    The model is loaded from ``save_models/RF_Num.pkl`` on every call and
    applied to ``preprocess_num(num_df)``.

    Returns:
        ``"Legitimate"`` when the first predicted label is ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/RF_Num.pkl")
    features = preprocess_num(num_df)
    first_label = model.predict(features)[0]
    if first_label == 'ham':
        return "Legitimate"
    return "Phishing"

def predict_extra(extra_df):
    """Classify the extra indicator row with the saved extra-feature model.

    The model is loaded from ``save_models/RF_extra.pkl`` on every call and
    applied to ``preprocess_extra(extra_df)``.

    Returns:
        ``"Legitimate"`` when the first predicted label is ``'ham'``,
        otherwise ``"Phishing"``.
    """
    model = load("save_models/RF_extra.pkl")
    features = preprocess_extra(extra_df)
    first_label = model.predict(features)[0]
    if first_label == 'ham':
        return "Legitimate"
    return "Phishing"

def preprocess_content(content):
    """Transform text input with the pickled vectorizer for the content model.

    The vectorizer is unpickled from ``vectorizer/content_tfidf.pickle`` on
    every call (a local, trusted artefact -- never point this at user data).

    Returns:
        Whatever the loaded object's ``transform(content)`` produces.
    """
    with open('vectorizer/content_tfidf.pickle', 'rb') as handle:
        vectorizer = pickle.load(handle)
    return vectorizer.transform(content)

def preprocess_html(html_tag):
    """Transform HTML-tag input with the pickled vectorizer for the tag model.

    The vectorizer is unpickled from ``vectorizer/html_cv.pickle`` on every
    call (a local, trusted artefact -- never point this at user data).

    Returns:
        Whatever the loaded object's ``transform(html_tag)`` produces.
    """
    with open('vectorizer/html_cv.pickle', 'rb') as handle:
        vectorizer = pickle.load(handle)
    return vectorizer.transform(html_tag)

def preprocess_num(num_df):
    """Scale the numeric feature frame with the pickled scaler.

    The scaler is unpickled from ``vectorizer/num_scaler.pkl`` on every call
    (a local, trusted artefact -- never point this at user data).

    Args:
        num_df: object exposing ``.values``, which is what gets transformed.

    Returns:
        Whatever the loaded object's ``transform(num_df.values)`` produces.
    """
    with open('vectorizer/num_scaler.pkl', 'rb') as handle:
        scaler = pickle.load(handle)
    return scaler.transform(num_df.values)

def preprocess_extra(extra_df):
    """Scale the extra indicator frame with the pickled scaler.

    The scaler is unpickled from ``vectorizer/extra_scaler.pkl`` on every call
    (a local, trusted artefact -- never point this at user data).

    Args:
        extra_df: object exposing ``.values``, which is what gets transformed.

    Returns:
        Whatever the loaded object's ``transform(extra_df.values)`` produces.
    """
    with open('vectorizer/extra_scaler.pkl', 'rb') as handle:
        scaler = pickle.load(handle)
    return scaler.transform(extra_df.values)


# Shared lemmatizer instance, created once at import time.
lemmatizer = WordNetLemmatizer()


def customtokenize(str):
    """Tokenize text, drop English stopwords and lemmatize what remains.

    Args:
        str: the text to tokenize. The name shadows the builtin but is kept
            so existing callers that reference this function (by name or by
            keyword) keep working.

    Returns:
        A list of lemmatized tokens, in their original order, excluding any
        token that appears verbatim (case-sensitive) in NLTK's English
        stopword list.
    """
    # Split string as tokens
    tokens = nltk.word_tokenize(str)
    # Build the stopword lookup once per call instead of re-reading the corpus
    # list for every token; a set also makes each membership test O(1).
    english_stopwords = set(stopwords.words('english'))
    # Filter stopwords and lemmatize in a single pass
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]