Spaces:
Runtime error
Runtime error
Commit
·
ec5e027
1
Parent(s):
fd712e2
Update analze.py
Browse files
analze.py
CHANGED
@@ -36,8 +36,8 @@ def save_file(up_file):
|
|
36 |
f.write(up_file.getbuffer())
|
37 |
print("file save: "+up_file.name)
|
38 |
|
39 |
-
def text_feature(filepath):
|
40 |
-
text = get_text(
|
41 |
# print(text)
|
42 |
if text != "":
|
43 |
text = text.split()
|
@@ -45,20 +45,20 @@ def text_feature(filepath):
|
|
45 |
dataf = pd.DataFrame([[textlist]], columns=['text'])
|
46 |
return dataf
|
47 |
|
48 |
-
def html_tags_feature(
|
49 |
-
tags = get_tags_from_html(get_html_general(
|
50 |
taglist = ' '.join(tags) if tags !=[] else []
|
51 |
dataf = pd.DataFrame([[taglist]], columns=['tags'])
|
52 |
return dataf
|
53 |
|
54 |
-
def extra_feature(filepath):
|
55 |
-
spf = check_spf(
|
56 |
-
dkim = check_dkim(
|
57 |
-
dmarc = check_dmarc(
|
58 |
-
deliver_receiver = check_deliver_receiver(
|
59 |
-
encript = check_encript(
|
60 |
-
onclick = get_onclicks(
|
61 |
-
popwindow = check_popWindow(
|
62 |
extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
|
63 |
extra_data_row = [0 if x is None else x for x in extra_data_row]
|
64 |
extra_data_row = [1 if x is True else x for x in extra_data_row]
|
@@ -67,32 +67,32 @@ def extra_feature(filepath):
|
|
67 |
columns=['SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC', 'Deliver-to Matches Receiver', 'Message_encrtpted', 'Onclick_events', 'Popwindow'])
|
68 |
return extra_data
|
69 |
|
70 |
-
def num_feature(
|
71 |
-
body_richness = get_body_richness(
|
72 |
-
func_words = get_num_FunctionWords(
|
73 |
-
sbj_richness = get_sbj_richness(
|
74 |
-
urls = get_num_urls(
|
75 |
-
ipurls = get_num_urls_ip(
|
76 |
-
imageurls = get_num_image_urls(
|
77 |
-
domainurls = get_num_domain_urls(
|
78 |
-
urlport = get_num_url_ports(
|
79 |
-
sen_chars = get_chars_sender(
|
80 |
num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls, imageurls, domainurls, urlport, sen_chars]
|
81 |
num_data_row = [0 if x is None else x for x in num_data_row]
|
82 |
num_data = pd.DataFrame([num_data_row],
|
83 |
columns=['body richness', 'Include function words', 'Subject richness', 'Numers of URLs', 'IPURLs', 'ImageURLs',
|
84 |
'DomainURLs', 'URLs contain port information', 'Characters in senders'])
|
85 |
return num_data
|
86 |
-
def get_features(
|
87 |
# text
|
88 |
-
textlist = text_feature(
|
89 |
# html tags
|
90 |
-
taglist = html_tags_feature(
|
91 |
#extra feature
|
92 |
-
extra_data = extra_feature(
|
93 |
# Numeric data
|
94 |
|
95 |
-
num_data = num_feature(
|
96 |
combined_df = pd.concat([textlist, taglist, num_data,extra_data], axis=1)
|
97 |
# print(combined_df)
|
98 |
return combined_df
|
|
|
36 |
f.write(up_file.getbuffer())
|
37 |
print("file save: "+up_file.name)
|
38 |
|
39 |
+
def text_feature(file):
|
40 |
+
text = get_text(file)
|
41 |
# print(text)
|
42 |
if text != "":
|
43 |
text = text.split()
|
|
|
45 |
dataf = pd.DataFrame([[textlist]], columns=['text'])
|
46 |
return dataf
|
47 |
|
48 |
+
def html_tags_feature(file):
    """Return a one-row DataFrame with a single 'tags' column for *file*.

    *file* is handed unchanged to ``get_html_general``; its result is fed to
    ``get_tags_from_html``. When that yields a non-empty result the tags are
    joined with single spaces into one string; when it equals ``[]`` the cell
    holds an empty list instead.
    """
    found_tags = get_tags_from_html(get_html_general(file))
    if found_tags != []:
        cell = ' '.join(found_tags)
    else:
        # No tags: the cell keeps an empty list (not an empty string).
        cell = []
    return pd.DataFrame([[cell]], columns=['tags'])
|
53 |
|
54 |
+
def extra_feature(file):
|
55 |
+
spf = check_spf(file)
|
56 |
+
dkim = check_dkim(file)
|
57 |
+
dmarc = check_dmarc(file)
|
58 |
+
deliver_receiver = check_deliver_receiver(file)
|
59 |
+
encript = check_encript(file)
|
60 |
+
onclick = get_onclicks(file)
|
61 |
+
popwindow = check_popWindow(file)
|
62 |
extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
|
63 |
extra_data_row = [0 if x is None else x for x in extra_data_row]
|
64 |
extra_data_row = [1 if x is True else x for x in extra_data_row]
|
|
|
67 |
columns=['SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC', 'Deliver-to Matches Receiver', 'Message_encrtpted', 'Onclick_events', 'Popwindow'])
|
68 |
return extra_data
|
69 |
|
70 |
+
def num_feature(file):
    """Return a one-row DataFrame of the nine numeric features of *file*.

    Each extractor below is called once with *file*, in the listed order.
    Any extractor result that is ``None`` is stored as ``0``; every other
    value is stored unchanged under the paired column label.
    """
    # (column label, extractor) pairs; this order is both the call order and
    # the column order of the resulting frame.
    extractors = (
        ('body richness', get_body_richness),
        ('Include function words', get_num_FunctionWords),
        ('Subject richness', get_sbj_richness),
        ('Numers of URLs', get_num_urls),
        ('IPURLs', get_num_urls_ip),
        ('ImageURLs', get_num_image_urls),
        ('DomainURLs', get_num_domain_urls),
        ('URLs contain port information', get_num_url_ports),
        ('Characters in senders', get_chars_sender),
    )
    labels = [label for label, _ in extractors]
    row = []
    for _, extract in extractors:
        value = extract(file)
        row.append(0 if value is None else value)
    return pd.DataFrame([row], columns=labels)
|
86 |
+
def get_features(file):
    """Return one DataFrame combining all feature groups computed for *file*.

    Calls ``text_feature``, ``html_tags_feature``, ``extra_feature`` and
    ``num_feature`` (in that order) with *file* and concatenates their
    results column-wise.
    """
    text_df = text_feature(file)
    tags_df = html_tags_feature(file)
    extra_df = extra_feature(file)
    numeric_df = num_feature(file)
    # Column order differs from call order: numeric columns come before the
    # extra (header/script check) columns in the combined frame.
    return pd.concat([text_df, tags_df, numeric_df, extra_df], axis=1)
|