"""Feature extraction and model-inference helpers for ham/phishing classification.

The module builds four feature groups from a message file (free text, HTML
tags, numeric counts and "extra" header/behaviour flags), and exposes one
``predict_*`` function per group that loads a persisted classifier and maps
its output to ``"Legitimate"`` or ``"Phishing"``.

All low-level extractors (``get_text``, ``check_spf``, ...) come from the
project-local ``modules`` package via a star import; their exact return
types are not visible here.
"""

# NOTE: the star import is deliberately kept first so that any name it
# exports is still overridden by the explicit imports below, exactly as before.
from modules import *
from pathlib import Path
import pandas as pd
from flask import Flask, render_template, request
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from joblib import load
import sklearn
import ssl
import os

# SECURITY NOTE(review): this replaces the default HTTPS context factory with
# an unverified one for the *whole process*, i.e. TLS certificates are no
# longer checked by anything using ``ssl._create_default_https_context``.
# It is presumably a workaround for ``nltk.download`` on machines with broken
# certificate stores -- confirm it is still needed before shipping.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Interpreter without the private helper: leave the default context alone.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# One-time setup: the NLTK resources used by ``customtokenize`` must be
# available locally. Uncomment to fetch them:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('wordnet')

# Column labels are runtime strings that callers may index by; they are kept
# byte-for-byte (including their historical spelling).
_EXTRA_COLUMNS = [
    'SPF(Pass:1,Neutral:2,Softdail:3,None:0)',
    'DKIM',
    'DMARC',
    'Deliver-to Matches Receiver',
    'Message_encrtpted',
    'Onclick_events',
    'Popwindow',
]

_NUM_COLUMNS = [
    'body richness',
    'Include function words',
    'Subject richness',
    'Numers of URLs',
    'IPURLs',
    'ImageURLs',
    'DomainURLs',
    'URLs contain port information',
    'Characters in senders',
]


def text_feature(file):
    """Return a one-row DataFrame with the whitespace-normalised message text.

    The text obtained from ``get_text(file)`` is split on whitespace and
    re-joined with single spaces. The result is stored in a single column
    named ``'text'``.

    Empty (or ``None``) text yields an empty string in the cell. Previously
    the empty case left the intermediate variable unbound and the function
    failed instead of returning a frame.
    """
    text = get_text(file)
    # Truthiness covers both "" and None; for non-empty text the result is
    # identical to the former split/join sequence.
    normalised = ' '.join(text.split()) if text else ""
    return pd.DataFrame([[normalised]], columns=['text'])


def html_tags_feature(file):
    """Return a one-row DataFrame with the message's HTML tags in column ``'tags'``.

    Tags come from ``get_tags_from_html(get_html_general(file))`` and are
    joined with single spaces.

    NOTE(review): when no tags are found the cell holds an empty *list*
    rather than an empty string. That is preserved for compatibility, but a
    text vectorizer fed this cell will likely reject it -- verify callers.
    """
    tags = get_tags_from_html(get_html_general(file))
    taglist = ' '.join(tags) if tags != [] else []
    return pd.DataFrame([[taglist]], columns=['tags'])


def _normalise_flag(value):
    """Map ``None``/``False`` to 0 and ``True`` to 1; pass other values through.

    Identity checks are used on purpose so that only the real ``bool``
    singletons are rewritten (integers such as 2 or 3 stay untouched).
    """
    if value is None or value is False:
        return 0
    if value is True:
        return 1
    return value


def extra_feature(file):
    """Return a one-row DataFrame of header/behaviour indicators for ``file``.

    The row holds, in order: SPF, DKIM, DMARC, deliver-to/receiver match,
    encryption, onclick events and pop-up window results, each produced by
    the corresponding ``modules`` helper and normalised with
    ``_normalise_flag``.
    """
    raw_values = [
        check_spf(file),
        check_dkim(file),
        check_dmarc(file),
        check_deliver_receiver(file),
        check_encript(file),
        get_onclicks(file),
        check_popWindow(file),
    ]
    row = [_normalise_flag(value) for value in raw_values]
    return pd.DataFrame([row], columns=_EXTRA_COLUMNS)


def num_feature(file):
    """Return a one-row DataFrame of numeric features for ``file``.

    The row holds, in order: body richness, function-word count, subject
    richness, and the URL / IP-URL / image-URL / domain-URL / URL-port /
    sender-character counts, each produced by the corresponding ``modules``
    helper. ``None`` results are replaced by 0.
    """
    raw_values = [
        get_body_richness(file),
        get_num_FunctionWords(file),
        get_sbj_richness(file),
        get_num_urls(file),
        get_num_urls_ip(file),
        get_num_image_urls(file),
        get_num_domain_urls(file),
        get_num_url_ports(file),
        get_chars_sender(file),
    ]
    row = [0 if value is None else value for value in raw_values]
    return pd.DataFrame([row], columns=_NUM_COLUMNS)


def get_features(file):
    """Return all feature groups for ``file`` as a single one-row DataFrame.

    Columns are laid out side by side in this order: text, tags, numeric
    features, extra features.
    """
    frames = [
        text_feature(file),       # text
        html_tags_feature(file),  # html tags
        num_feature(file),        # numeric data
        extra_feature(file),      # extra feature
    ]
    return pd.concat(frames, axis=1)


def _label_from_prediction(prediction):
    """Translate the first predicted class into the user-facing label."""
    return "Legitimate" if prediction[0] == 'ham' else "Phishing"


def predict_content(content):
    """Classify message text; return ``"Legitimate"`` or ``"Phishing"``.

    ``content`` is passed to the TF-IDF vectorizer's ``transform`` and so is
    presumably an iterable of strings -- confirm against the caller.
    """
    content_clf = load("save_models/SVM_finalcontent.pkl")
    return _label_from_prediction(content_clf.predict(preprocess_content(content)))


def predict_html(html_tag):
    """Classify HTML-tag text; return ``"Legitimate"`` or ``"Phishing"``."""
    html_clf = load("save_models/Stack_tag.pkl")
    return _label_from_prediction(html_clf.predict(preprocess_html(html_tag)))


def predict_num(num_df):
    """Classify the numeric feature frame; return ``"Legitimate"`` or ``"Phishing"``."""
    num_clf = load("save_models/RF_Num.pkl")
    return _label_from_prediction(num_clf.predict(preprocess_num(num_df)))


def predict_extra(extra_df):
    """Classify the extra feature frame; return ``"Legitimate"`` or ``"Phishing"``."""
    extra_clf = load("save_models/RF_extra.pkl")
    return _label_from_prediction(extra_clf.predict(preprocess_extra(extra_df)))


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    SECURITY NOTE: ``pickle.load`` executes arbitrary code from the file.
    The paths used in this module are fixed, local artefacts; never point
    this at user-supplied data.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def preprocess_content(content):
    """Transform ``content`` with the persisted TF-IDF vectorizer."""
    tfidf = _load_pickle('vectorizer/content_tfidf.pickle')
    return tfidf.transform(content)


def preprocess_html(html_tag):
    """Transform ``html_tag`` with the persisted vectorizer for HTML tags."""
    cv = _load_pickle('vectorizer/html_cv.pickle')
    return cv.transform(html_tag)


def preprocess_num(num_df):
    """Scale the values of ``num_df`` with the persisted numeric scaler."""
    num_scaler = _load_pickle('vectorizer/num_scaler.pkl')
    return num_scaler.transform(num_df.values)


def preprocess_extra(extra_df):
    """Scale the values of ``extra_df`` with the persisted extra-feature scaler."""
    extra_scaler = _load_pickle('vectorizer/extra_scaler.pkl')
    return extra_scaler.transform(extra_df.values)


lemmatizer = WordNetLemmatizer()


def customtokenize(str):
    """Tokenize ``str``, drop English stop words and lemmatize what remains.

    Returns the list of lemmatized tokens in their original order.

    The parameter keeps its historical name ``str`` (which shadows the
    builtin inside this function) so existing keyword callers keep working.
    The function name must also stay stable: persisted vectorizers
    presumably reference it by name -- verify before renaming.
    """
    # Split string as tokens
    tokens = nltk.word_tokenize(str)
    # Build the stop-word set once per call; the previous code re-read the
    # stop-word list and did a linear list search for every single token.
    english_stopwords = set(stopwords.words('english'))
    # Filter for stopwords, then perform lemmatization
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]