allenchienxxx committed on
Commit
ec5e027
·
1 Parent(s): fd712e2

Update analze.py

Browse files
Files changed (1) hide show
  1. analze.py +27 -27
analze.py CHANGED
@@ -36,8 +36,8 @@ def save_file(up_file):
36
  f.write(up_file.getbuffer())
37
  print("file save: "+up_file.name)
38
 
39
- def text_feature(filepath):
40
- text = get_text(filepath)
41
  # print(text)
42
  if text != "":
43
  text = text.split()
@@ -45,20 +45,20 @@ def text_feature(filepath):
45
  dataf = pd.DataFrame([[textlist]], columns=['text'])
46
  return dataf
47
 
48
- def html_tags_feature(filepath):
49
- tags = get_tags_from_html(get_html_general(filepath))
50
  taglist = ' '.join(tags) if tags !=[] else []
51
  dataf = pd.DataFrame([[taglist]], columns=['tags'])
52
  return dataf
53
 
54
- def extra_feature(filepath):
55
- spf = check_spf(filepath)
56
- dkim = check_dkim(filepath)
57
- dmarc = check_dmarc(filepath)
58
- deliver_receiver = check_deliver_receiver(filepath)
59
- encript = check_encript(filepath)
60
- onclick = get_onclicks(filepath)
61
- popwindow = check_popWindow(filepath)
62
  extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
63
  extra_data_row = [0 if x is None else x for x in extra_data_row]
64
  extra_data_row = [1 if x is True else x for x in extra_data_row]
@@ -67,32 +67,32 @@ def extra_feature(filepath):
67
  columns=['SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC', 'Deliver-to Matches Receiver', 'Message_encrtpted', 'Onclick_events', 'Popwindow'])
68
  return extra_data
69
 
70
- def num_feature(filepath):
71
- body_richness = get_body_richness(filepath)
72
- func_words = get_num_FunctionWords(filepath)
73
- sbj_richness = get_sbj_richness(filepath)
74
- urls = get_num_urls(filepath)
75
- ipurls = get_num_urls_ip(filepath)
76
- imageurls = get_num_image_urls(filepath)
77
- domainurls = get_num_domain_urls(filepath)
78
- urlport = get_num_url_ports(filepath)
79
- sen_chars = get_chars_sender(filepath)
80
  num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls, imageurls, domainurls, urlport, sen_chars]
81
  num_data_row = [0 if x is None else x for x in num_data_row]
82
  num_data = pd.DataFrame([num_data_row],
83
  columns=['body richness', 'Include function words', 'Subject richness', 'Numers of URLs', 'IPURLs', 'ImageURLs',
84
  'DomainURLs', 'URLs contain port information', 'Characters in senders'])
85
  return num_data
86
- def get_features(filepath):
87
  # text
88
- textlist = text_feature(filepath)
89
  # html tags
90
- taglist = html_tags_feature(filepath)
91
  #extra feature
92
- extra_data = extra_feature(filepath)
93
  # Numeric data
94
 
95
- num_data = num_feature(filepath)
96
  combined_df = pd.concat([textlist, taglist, num_data,extra_data], axis=1)
97
  # print(combined_df)
98
  return combined_df
 
36
  f.write(up_file.getbuffer())
37
  print("file save: "+up_file.name)
38
 
39
+ def text_feature(file):
40
+ text = get_text(file)
41
  # print(text)
42
  if text != "":
43
  text = text.split()
 
45
  dataf = pd.DataFrame([[textlist]], columns=['text'])
46
  return dataf
47
 
48
+ def html_tags_feature(file):
49
+ tags = get_tags_from_html(get_html_general(file))
50
  taglist = ' '.join(tags) if tags !=[] else []
51
  dataf = pd.DataFrame([[taglist]], columns=['tags'])
52
  return dataf
53
 
54
+ def extra_feature(file):
55
+ spf = check_spf(file)
56
+ dkim = check_dkim(file)
57
+ dmarc = check_dmarc(file)
58
+ deliver_receiver = check_deliver_receiver(file)
59
+ encript = check_encript(file)
60
+ onclick = get_onclicks(file)
61
+ popwindow = check_popWindow(file)
62
  extra_data_row = [spf, dkim, dmarc, deliver_receiver, encript, onclick, popwindow]
63
  extra_data_row = [0 if x is None else x for x in extra_data_row]
64
  extra_data_row = [1 if x is True else x for x in extra_data_row]
 
67
  columns=['SPF(Pass:1,Neutral:2,Softdail:3,None:0)', 'DKIM', 'DMARC', 'Deliver-to Matches Receiver', 'Message_encrtpted', 'Onclick_events', 'Popwindow'])
68
  return extra_data
69
 
70
+ def num_feature(file):
71
+ body_richness = get_body_richness(file)
72
+ func_words = get_num_FunctionWords(file)
73
+ sbj_richness = get_sbj_richness(file)
74
+ urls = get_num_urls(file)
75
+ ipurls = get_num_urls_ip(file)
76
+ imageurls = get_num_image_urls(file)
77
+ domainurls = get_num_domain_urls(file)
78
+ urlport = get_num_url_ports(file)
79
+ sen_chars = get_chars_sender(file)
80
  num_data_row = [body_richness, func_words, sbj_richness, urls, ipurls, imageurls, domainurls, urlport, sen_chars]
81
  num_data_row = [0 if x is None else x for x in num_data_row]
82
  num_data = pd.DataFrame([num_data_row],
83
  columns=['body richness', 'Include function words', 'Subject richness', 'Numers of URLs', 'IPURLs', 'ImageURLs',
84
  'DomainURLs', 'URLs contain port information', 'Characters in senders'])
85
  return num_data
86
+ def get_features(file):
87
  # text
88
+ textlist = text_feature(file)
89
  # html tags
90
+ taglist = html_tags_feature(file)
91
  #extra feature
92
+ extra_data = extra_feature(file)
93
  # Numeric data
94
 
95
+ num_data = num_feature(file)
96
  combined_df = pd.concat([textlist, taglist, num_data,extra_data], axis=1)
97
  # print(combined_df)
98
  return combined_df