cpi-connect committed on
Commit
83fd625
·
1 Parent(s): 7c6b6a6

Upload 16 files

Browse files
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoModelForTokenClassification
3
+ from annotated_text import annotated_text
4
+ import numpy as np
5
+ import os, joblib
6
+
7
+ from utils import get_idxs_from_text
8
+
9
# Token-classification model used by the whole app; the repository ships its
# own modelling code, hence trust_remote_code=True.
# NOTE(review): the hub warns that remote code files can change between
# downloads — consider pinning a `revision` once a vetted commit is known.
model = AutoModelForTokenClassification.from_pretrained(
    "CyberPeace-Institute/Cybersecurity-Knowledge-Graph",
    trust_remote_code=True,
)

# Argument-role classifiers, one per `.joblib` file found in the
# `arg_role_models` folder under the current working directory. Each is keyed
# by the part of its file name that precedes the first dot.
folder_path = '/arg_role_models'
role_classifiers = {}
models_dir = os.getcwd() + folder_path
for filename in os.listdir(models_dir):
    if not filename.endswith('.joblib'):
        continue
    file_path = os.path.join(models_dir, filename)
    # NOTE(review): joblib.load unpickles the file; only ship trusted models.
    role_classifiers[filename.split(".")[0]] = joblib.load(file_path)
19
+
20
def annotate(name):
    """Render the module-level ``output`` with the labels stored under ``name``.

    Consecutive tokens sharing the same label (ignoring any ``X-`` prefix such
    as ``B-``/``I-``) are merged into one segment. Segments labelled ``"O"``
    are passed to ``annotated_text`` as plain strings, every other segment as
    a ``(text, label)`` tuple.

    Args:
        name: Key of the per-token label to display (e.g. ``"nugget"``).
    """
    stripped_tokens = [entry["token"].replace(" ", "") for entry in output]
    text = model.tokenizer.decode([entry["id"] for entry in output])
    spans = get_idxs_from_text(text, stripped_tokens)
    tags = [entry[name] for entry in output]

    segments = []
    current_label = ""
    current_text = ""
    cursor = 0

    def flush():
        # "O" segments are shown unhighlighted, i.e. as bare strings.
        if current_label == "O":
            segments.append(current_text)
        else:
            segments.append((current_text, current_label))

    for span, tag in zip(spans, tags):
        if "-" in tag:
            short_label = tag.split("-")[1]
        else:
            short_label = tag

        if short_label == current_label:
            # Same label as the running segment: extend it with the slice of
            # decoded text up to the end of this token.
            current_text += text[cursor : span["end_idx"]]
        else:
            if current_label != "":
                flush()
            current_label = short_label
            current_text = span["word"]
        cursor = span["end_idx"]

    # Emit the segment that was still open when the tokens ran out.
    flush()
    annotated_text(segments)
51
+
52
def _collect_argument_entities(output):
    """Group the BIO-tagged ``"argument"`` labels of ``output`` into entities.

    Returns a list of dicts with keys ``label`` (tag without its ``B-``/``I-``
    prefix), ``text`` (space-stripped tokens joined by single spaces) and the
    inclusive token positions ``start`` / ``end``.

    An ``I-`` tag extends the open entity only when it has the same label and
    sits directly after the entity's last token; any other ``I-`` tag starts a
    new entity, exactly like a ``B-`` tag. Labels that are neither ``"O"`` nor
    ``B-``/``I-`` prefixed are ignored.
    """
    entities = []
    current = None
    for position, item in enumerate(output):
        label = item["argument"]
        if label == "O":
            continue
        kind = label[2:]
        token_text = item["token"].replace(" ", "")
        continues_current = (
            label.startswith("I-")
            and current is not None
            and current["label"] == kind
            and current["end"] == position - 1
        )
        if continues_current:
            current["text"] += " " + token_text
            current["end"] = position
        elif label.startswith(("B-", "I-")):
            if current is not None:
                entities.append(current)
            current = {"label": kind, "text": token_text, "start": position, "end": position}
    # Flush the entity still open at the end of the sequence; without this the
    # last argument of every text would never be assigned a role.
    if current is not None:
        entities.append(current)
    return entities


def get_arg_roles(output):
    """Add a ``"role"`` entry to every token dict of ``output``.

    Each argument entity found in the ``"argument"`` labels gets one role from
    ``model.arg_2_role[label]``: the single candidate when there is only one,
    otherwise the candidate indexed by the prediction of
    ``role_classifiers[label]`` on the concatenated embeddings of the entity's
    context and of the entity text. Tokens outside any entity get ``"O"``.

    Args:
        output: List of per-token dicts with at least the keys ``"id"``,
            ``"token"`` and ``"argument"``. It is modified in place.

    Returns:
        The same ``output`` list, with ``"role"`` set on every item.
    """
    entities = _collect_argument_entities(output)

    for entity in entities:
        # Context window: 15 tokens before the entity start, slice end at
        # entity end + 15 (exclusive), clamped to the sequence bounds.
        window = output[max(0, entity["start"] - 15) : min(len(output), entity["end"] + 15)]
        entity["context"] = model.tokenizer.decode([item["id"] for item in window])

        candidate_roles = model.arg_2_role[entity["label"]]
        if len(candidate_roles) > 1:
            sent_embed = model.embed_model.encode(entity["context"])
            arg_embed = model.embed_model.encode(entity["text"])
            embed = np.concatenate((sent_embed, arg_embed))
            role_id = role_classifiers[entity["label"]].predict(embed.reshape(1, -1))
            entity["role"] = candidate_roles[role_id[0]]
        else:
            entity["role"] = candidate_roles[0]

    for item in output:
        item["role"] = "O"
    for entity in entities:
        for position in range(entity["start"], entity["end"] + 1):
            output[position]["role"] = entity["role"]
    return output
88
+
89
# --- Streamlit page -------------------------------------------------------
st.title("Create Knowledge Graphs from Cyber Incidents")

text_input = st.text_area("Enter your text here", height=100)

if text_input:
    # `annotate` reads the module-level `output`, so it must be bound here
    # before any section is rendered.
    output = model(text_input)

    for heading, field in (
        ("Event Nuggets", "nugget"),
        ("Event Arguments", "argument"),
        ("Realis of Event Nuggets", "realis"),
    ):
        st.subheader(heading)
        annotate(field)

    # Roles are derived from the argument labels and stored under "role".
    output = get_arg_roles(output)
    st.subheader("Role of the Event Arguments")
    annotate("role")
arg_role_models/Capabilities.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d542202e44436144702ed00865ebd91f9714f78e5cd05277aa69fc66c15479ba
3
+ size 7728396
arg_role_models/Data.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6d04fd53240a8bc105b255c440c8d00354249cf268067ab4fbeef2fc94f73ed
3
+ size 2430452
arg_role_models/Device.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a56a3356645eb885604d651cb20d0f80887c710b1acedcbddddef5e11ddf92a2
3
+ size 4236556
arg_role_models/File.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e0ea25e1f1f7c215e5335bf7a2ee2638820762de25e84d47c8ad90601b096e9
3
+ size 3413236
arg_role_models/Money.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7481b7117ef96b4810ab7d1cba13544dd86497126a5db4546ee830d2649b8557
3
+ size 2741604
arg_role_models/Number.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18f178926cb3ea67f32e9537841b3c71abcef8b66342721e9c287d624ac19a90
3
+ size 3254948
arg_role_models/Organization.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:026de561dc01cd8f8f89ac52bbdd7c1cc8746e44627d2bb87de985273c03ad51
3
+ size 17816780
arg_role_models/PII.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25c5e38a3e1cf918fe9cb929db156de4c180d1cf8dc0ddceaef16c8a15ab6557
3
+ size 2911620
arg_role_models/Person.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60028d5e392e548f7cd8daf9681d9a181b5a8ea4c3f0a0a8f35d2d82314c3120
3
+ size 14664508
arg_role_models/Software.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9be024f9c4d0ae43db13ed4b4b0140f209b5b0c43a9bb675855a4d484c12a7d
3
+ size 4388264
arg_role_models/System.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d79586b50dec6bd6d838110ed568f66329e5266f86bc6350bcf522876d764309
3
+ size 7239048
arg_role_models/Version.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3584898a0d7af2ba4f927e91805018565218f43806ca9572751370ae8c5fa07f
3
+ size 3637220
arg_role_models/Website.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4c8e3d2f167d2c5bcef014051dd52c231ac8394d772b33343cbd1fe8424e4b7
3
+ size 3654520
test.ipynb ADDED
@@ -0,0 +1,1032 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "A new version of the following files was downloaded from https://huggingface.co/CyberPeace-Institute/Cybersecurity-Knowledge-Graph:\n",
13
+ "- configuration.py\n",
14
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
15
+ ]
16
+ },
17
+ {
18
+ "data": {
19
+ "application/vnd.jupyter.widget-view+json": {
20
+ "model_id": "047d95084e98403589b108942daf1a0c",
21
+ "version_major": 2,
22
+ "version_minor": 0
23
+ },
24
+ "text/plain": [
25
+ "Downloading (…)esolve/main/model.py: 0%| | 0.00/6.66k [00:00<?, ?B/s]"
26
+ ]
27
+ },
28
+ "metadata": {},
29
+ "output_type": "display_data"
30
+ },
31
+ {
32
+ "data": {
33
+ "application/vnd.jupyter.widget-view+json": {
34
+ "model_id": "840315351a8e489d9e65c4285a9511b2",
35
+ "version_major": 2,
36
+ "version_minor": 0
37
+ },
38
+ "text/plain": [
39
+ "Downloading (…)event_arg_predict.py: 0%| | 0.00/12.8k [00:00<?, ?B/s]"
40
+ ]
41
+ },
42
+ "metadata": {},
43
+ "output_type": "display_data"
44
+ },
45
+ {
46
+ "data": {
47
+ "application/vnd.jupyter.widget-view+json": {
48
+ "model_id": "e211aa84be44465396acb1e1e1b5b683",
49
+ "version_major": 2,
50
+ "version_minor": 0
51
+ },
52
+ "text/plain": [
53
+ "Downloading (…)/args_model_utils.py: 0%| | 0.00/11.2k [00:00<?, ?B/s]"
54
+ ]
55
+ },
56
+ "metadata": {},
57
+ "output_type": "display_data"
58
+ },
59
+ {
60
+ "name": "stderr",
61
+ "output_type": "stream",
62
+ "text": [
63
+ "A new version of the following files was downloaded from https://huggingface.co/CyberPeace-Institute/Cybersecurity-Knowledge-Graph:\n",
64
+ "- args_model_utils.py\n",
65
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
66
+ ]
67
+ },
68
+ {
69
+ "data": {
70
+ "application/vnd.jupyter.widget-view+json": {
71
+ "model_id": "b4c2fe2b94884fa5bc8e245a874ad21a",
72
+ "version_major": 2,
73
+ "version_minor": 0
74
+ },
75
+ "text/plain": [
76
+ "Downloading (…)nt_nugget_predict.py: 0%| | 0.00/10.7k [00:00<?, ?B/s]"
77
+ ]
78
+ },
79
+ "metadata": {},
80
+ "output_type": "display_data"
81
+ },
82
+ {
83
+ "data": {
84
+ "application/vnd.jupyter.widget-view+json": {
85
+ "model_id": "d192dbc95c6749b6a9c6a5c984c646fd",
86
+ "version_major": 2,
87
+ "version_minor": 0
88
+ },
89
+ "text/plain": [
90
+ "Downloading (…)ugget_model_utils.py: 0%| | 0.00/6.59k [00:00<?, ?B/s]"
91
+ ]
92
+ },
93
+ "metadata": {},
94
+ "output_type": "display_data"
95
+ },
96
+ {
97
+ "name": "stderr",
98
+ "output_type": "stream",
99
+ "text": [
100
+ "A new version of the following files was downloaded from https://huggingface.co/CyberPeace-Institute/Cybersecurity-Knowledge-Graph:\n",
101
+ "- nugget_model_utils.py\n",
102
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
103
+ ]
104
+ },
105
+ {
106
+ "data": {
107
+ "application/vnd.jupyter.widget-view+json": {
108
+ "model_id": "0ca6f6c3195c4708bb1834a0c876e893",
109
+ "version_major": 2,
110
+ "version_minor": 0
111
+ },
112
+ "text/plain": [
113
+ "Downloading (…)esolve/main/utils.py: 0%| | 0.00/7.05k [00:00<?, ?B/s]"
114
+ ]
115
+ },
116
+ "metadata": {},
117
+ "output_type": "display_data"
118
+ },
119
+ {
120
+ "name": "stderr",
121
+ "output_type": "stream",
122
+ "text": [
123
+ "A new version of the following files was downloaded from https://huggingface.co/CyberPeace-Institute/Cybersecurity-Knowledge-Graph:\n",
124
+ "- utils.py\n",
125
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
126
+ "A new version of the following files was downloaded from https://huggingface.co/CyberPeace-Institute/Cybersecurity-Knowledge-Graph:\n",
127
+ "- event_nugget_predict.py\n",
128
+ "- nugget_model_utils.py\n",
129
+ "- utils.py\n",
130
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
131
+ "A new version of the following files was downloaded from https://huggingface.co/CyberPeace-Institute/Cybersecurity-Knowledge-Graph:\n",
132
+ "- event_arg_predict.py\n",
133
+ "- args_model_utils.py\n",
134
+ "- event_nugget_predict.py\n",
135
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
136
+ ]
137
+ },
138
+ {
139
+ "data": {
140
+ "application/vnd.jupyter.widget-view+json": {
141
+ "model_id": "2776f9469d414ed6aaa5b594fae0c6a8",
142
+ "version_major": 2,
143
+ "version_minor": 0
144
+ },
145
+ "text/plain": [
146
+ "Downloading (…)nt_realis_predict.py: 0%| | 0.00/11.5k [00:00<?, ?B/s]"
147
+ ]
148
+ },
149
+ "metadata": {},
150
+ "output_type": "display_data"
151
+ },
152
+ {
153
+ "data": {
154
+ "application/vnd.jupyter.widget-view+json": {
155
+ "model_id": "22265c1ceb7d4611a5118fbde096bc1d",
156
+ "version_major": 2,
157
+ "version_minor": 0
158
+ },
159
+ "text/plain": [
160
+ "Downloading (…)ealis_model_utils.py: 0%| | 0.00/7.09k [00:00<?, ?B/s]"
161
+ ]
162
+ },
163
+ "metadata": {},
164
+ "output_type": "display_data"
165
+ },
166
+ {
167
+ "name": "stderr",
168
+ "output_type": "stream",
169
+ "text": [
170
+ "A new version of the following files was downloaded from https://huggingface.co/CyberPeace-Institute/Cybersecurity-Knowledge-Graph:\n",
171
+ "- realis_model_utils.py\n",
172
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
173
+ "A new version of the following files was downloaded from https://huggingface.co/CyberPeace-Institute/Cybersecurity-Knowledge-Graph:\n",
174
+ "- event_realis_predict.py\n",
175
+ "- realis_model_utils.py\n",
176
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
177
+ "A new version of the following files was downloaded from https://huggingface.co/CyberPeace-Institute/Cybersecurity-Knowledge-Graph:\n",
178
+ "- model.py\n",
179
+ "- event_arg_predict.py\n",
180
+ "- event_realis_predict.py\n",
181
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n",
182
+ "Some weights of RobertaModel were not initialized from the model checkpoint at ehsanaghaei/SecureBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
183
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
184
+ "Some weights of RobertaModel were not initialized from the model checkpoint at ehsanaghaei/SecureBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
185
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
186
+ "Some weights of RobertaModel were not initialized from the model checkpoint at ehsanaghaei/SecureBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
187
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
188
+ ]
189
+ }
190
+ ],
191
+ "source": [
192
+ "# Load model directly\n",
193
+ "from transformers import AutoModelForTokenClassification\n",
194
+ "model = AutoModelForTokenClassification.from_pretrained(\"CyberPeace-Institute/Cybersecurity-Knowledge-Graph\", trust_remote_code=True)"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 2,
200
+ "metadata": {},
201
+ "outputs": [
202
+ {
203
+ "data": {
204
+ "application/vnd.jupyter.widget-view+json": {
205
+ "model_id": "e3e5818a4d6b4ba99243603a068622d0",
206
+ "version_major": 2,
207
+ "version_minor": 0
208
+ },
209
+ "text/plain": [
210
+ "Map: 0%| | 0/1 [00:00<?, ? examples/s]"
211
+ ]
212
+ },
213
+ "metadata": {},
214
+ "output_type": "display_data"
215
+ },
216
+ {
217
+ "data": {
218
+ "application/vnd.jupyter.widget-view+json": {
219
+ "model_id": "f3a24eb19f414ae3ad974b1461fc3e64",
220
+ "version_major": 2,
221
+ "version_minor": 0
222
+ },
223
+ "text/plain": [
224
+ "Map: 0%| | 0/1 [00:00<?, ? examples/s]"
225
+ ]
226
+ },
227
+ "metadata": {},
228
+ "output_type": "display_data"
229
+ },
230
+ {
231
+ "data": {
232
+ "application/vnd.jupyter.widget-view+json": {
233
+ "model_id": "3c3dd376f5614f739e5c0780af8b3a9b",
234
+ "version_major": 2,
235
+ "version_minor": 0
236
+ },
237
+ "text/plain": [
238
+ "Map: 0%| | 0/1 [00:00<?, ? examples/s]"
239
+ ]
240
+ },
241
+ "metadata": {},
242
+ "output_type": "display_data"
243
+ },
244
+ {
245
+ "data": {
246
+ "application/vnd.jupyter.widget-view+json": {
247
+ "model_id": "20c9cd31ea854bb58208249c18840e14",
248
+ "version_major": 2,
249
+ "version_minor": 0
250
+ },
251
+ "text/plain": [
252
+ "Map: 0%| | 0/1 [00:00<?, ? examples/s]"
253
+ ]
254
+ },
255
+ "metadata": {},
256
+ "output_type": "display_data"
257
+ },
258
+ {
259
+ "data": {
260
+ "application/vnd.jupyter.widget-view+json": {
261
+ "model_id": "a668052410cf41968f97ebdcf187debc",
262
+ "version_major": 2,
263
+ "version_minor": 0
264
+ },
265
+ "text/plain": [
266
+ "Map: 0%| | 0/1 [00:00<?, ? examples/s]"
267
+ ]
268
+ },
269
+ "metadata": {},
270
+ "output_type": "display_data"
271
+ },
272
+ {
273
+ "data": {
274
+ "text/plain": [
275
+ "[{'id': 1437, 'token': ' ', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
276
+ " {'id': 50118, 'token': '\\n', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
277
+ " {'id': 287, 'token': ' As', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
278
+ " {'id': 9, 'token': ' of', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
279
+ " {'id': 502, 'token': ' June', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
280
+ " {'id': 336, 'token': ' 2016', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
281
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
282
+ " {'id': 55, 'token': ' more', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
283
+ " {'id': 87, 'token': ' than', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
284
+ " {'id': 3982, 'token': ' 150', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
285
+ " {'id': 153,\n",
286
+ " 'token': ' million',\n",
287
+ " 'nugget': 'O',\n",
288
+ " 'argument': 'O',\n",
289
+ " 'realis': 'O'},\n",
290
+ " {'id': 2171,\n",
291
+ " 'token': ' active',\n",
292
+ " 'nugget': 'O',\n",
293
+ " 'argument': 'O',\n",
294
+ " 'realis': 'O'},\n",
295
+ " {'id': 1434,\n",
296
+ " 'token': ' users',\n",
297
+ " 'nugget': 'O',\n",
298
+ " 'argument': 'O',\n",
299
+ " 'realis': 'O'},\n",
300
+ " {'id': 10754,\n",
301
+ " 'token': ' interact',\n",
302
+ " 'nugget': 'O',\n",
303
+ " 'argument': 'O',\n",
304
+ " 'realis': 'O'},\n",
305
+ " {'id': 19, 'token': ' with', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
306
+ " {'id': 65, 'token': ' one', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
307
+ " {'id': 277,\n",
308
+ " 'token': ' another',\n",
309
+ " 'nugget': 'O',\n",
310
+ " 'argument': 'O',\n",
311
+ " 'realis': 'O'},\n",
312
+ " {'id': 1230,\n",
313
+ " 'token': ' daily',\n",
314
+ " 'nugget': 'O',\n",
315
+ " 'argument': 'O',\n",
316
+ " 'realis': 'O'},\n",
317
+ " {'id': 1241, 'token': ' via', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
318
+ " {'id': 11477,\n",
319
+ " 'token': ' Snapchat',\n",
320
+ " 'nugget': 'O',\n",
321
+ " 'argument': 'O',\n",
322
+ " 'realis': 'O'},\n",
323
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
324
+ " {'id': 5763,\n",
325
+ " 'token': ' Others',\n",
326
+ " 'nugget': 'O',\n",
327
+ " 'argument': 'O',\n",
328
+ " 'realis': 'O'},\n",
329
+ " {'id': 32, 'token': ' are', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
330
+ " {'id': 4777,\n",
331
+ " 'token': ' drawn',\n",
332
+ " 'nugget': 'O',\n",
333
+ " 'argument': 'O',\n",
334
+ " 'realis': 'O'},\n",
335
+ " {'id': 30, 'token': ' by', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
336
+ " {'id': 5, 'token': ' the', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
337
+ " {'id': 544,\n",
338
+ " 'token': ' service',\n",
339
+ " 'nugget': 'O',\n",
340
+ " 'argument': 'O',\n",
341
+ " 'realis': 'O'},\n",
342
+ " {'id': 128, 'token': \" '\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
343
+ " {'id': 29, 'token': 's', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
344
+ " {'id': 55, 'token': ' more', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
345
+ " {'id': 485,\n",
346
+ " 'token': ' recent',\n",
347
+ " 'nugget': 'O',\n",
348
+ " 'argument': 'O',\n",
349
+ " 'realis': 'O'},\n",
350
+ " {'id': 1575,\n",
351
+ " 'token': ' features',\n",
352
+ " 'nugget': 'O',\n",
353
+ " 'argument': 'O',\n",
354
+ " 'realis': 'O'},\n",
355
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
356
+ " {'id': 2246,\n",
357
+ " 'token': ' Those',\n",
358
+ " 'nugget': 'O',\n",
359
+ " 'argument': 'O',\n",
360
+ " 'realis': 'O'},\n",
361
+ " {'id': 680,\n",
362
+ " 'token': ' include',\n",
363
+ " 'nugget': 'O',\n",
364
+ " 'argument': 'O',\n",
365
+ " 'realis': 'O'},\n",
366
+ " {'id': 14100,\n",
367
+ " 'token': ' Snap',\n",
368
+ " 'nugget': 'O',\n",
369
+ " 'argument': 'O',\n",
370
+ " 'realis': 'O'},\n",
371
+ " {'id': 20122, 'token': 'cash', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
372
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
373
+ " {'id': 10, 'token': ' a', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
374
+ " {'id': 5448,\n",
375
+ " 'token': ' method',\n",
376
+ " 'nugget': 'O',\n",
377
+ " 'argument': 'O',\n",
378
+ " 'realis': 'O'},\n",
379
+ " {'id': 2942,\n",
380
+ " 'token': ' introduced',\n",
381
+ " 'nugget': 'O',\n",
382
+ " 'argument': 'O',\n",
383
+ " 'realis': 'O'},\n",
384
+ " {'id': 13, 'token': ' for', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
385
+ " {'id': 1434,\n",
386
+ " 'token': ' users',\n",
387
+ " 'nugget': 'O',\n",
388
+ " 'argument': 'O',\n",
389
+ " 'realis': 'O'},\n",
390
+ " {'id': 7, 'token': ' to', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
391
+ " {'id': 2142, 'token': ' send', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
392
+ " {'id': 1830,\n",
393
+ " 'token': ' mobile',\n",
394
+ " 'nugget': 'O',\n",
395
+ " 'argument': 'O',\n",
396
+ " 'realis': 'O'},\n",
397
+ " {'id': 3081,\n",
398
+ " 'token': ' payments',\n",
399
+ " 'nugget': 'O',\n",
400
+ " 'argument': 'O',\n",
401
+ " 'realis': 'O'},\n",
402
+ " {'id': 7, 'token': ' to', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
403
+ " {'id': 49, 'token': ' their', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
404
+ " {'id': 964,\n",
405
+ " 'token': ' friends',\n",
406
+ " 'nugget': 'O',\n",
407
+ " 'argument': 'O',\n",
408
+ " 'realis': 'O'},\n",
409
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
410
+ " {'id': 1437, 'token': ' ', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
411
+ " {'id': 50118, 'token': '\\n', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
412
+ " {'id': 6211,\n",
413
+ " 'token': ' Given',\n",
414
+ " 'nugget': 'O',\n",
415
+ " 'argument': 'O',\n",
416
+ " 'realis': 'O'},\n",
417
+ " {'id': 5, 'token': ' the', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
418
+ " {'id': 1553, 'token': ' app', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
419
+ " {'id': 128, 'token': \" '\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
420
+ " {'id': 29, 'token': 's', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
421
+ " {'id': 7347,\n",
422
+ " 'token': ' popularity',\n",
423
+ " 'nugget': 'O',\n",
424
+ " 'argument': 'O',\n",
425
+ " 'realis': 'O'},\n",
426
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
427
+ " {'id': 24, 'token': ' it', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
428
+ " {'id': 128, 'token': \" '\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
429
+ " {'id': 29, 'token': 's', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
430
+ " {'id': 117, 'token': ' no', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
431
+ " {'id': 5170,\n",
432
+ " 'token': ' wonder',\n",
433
+ " 'nugget': 'O',\n",
434
+ " 'argument': 'O',\n",
435
+ " 'realis': 'O'},\n",
436
+ " {'id': 804,\n",
437
+ " 'token': ' online',\n",
438
+ " 'nugget': 'O',\n",
439
+ " 'argument': 'O',\n",
440
+ " 'realis': 'O'},\n",
441
+ " {'id': 9177,\n",
442
+ " 'token': ' criminals',\n",
443
+ " 'nugget': 'O',\n",
444
+ " 'argument': 'O',\n",
445
+ " 'realis': 'O'},\n",
446
+ " {'id': 33, 'token': ' have', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
447
+ " {'id': 278, 'token': ' set', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
448
+ " {'id': 49, 'token': ' their', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
449
+ " {'id': 579, 'token': ' s', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
450
+ " {'id': 6183, 'token': 'ights', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
451
+ " {'id': 15, 'token': ' on', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
452
+ " {'id': 11597,\n",
453
+ " 'token': ' hacking',\n",
454
+ " 'nugget': 'O',\n",
455
+ " 'argument': 'O',\n",
456
+ " 'realis': 'O'},\n",
457
+ " {'id': 1434,\n",
458
+ " 'token': ' users',\n",
459
+ " 'nugget': 'O',\n",
460
+ " 'argument': 'O',\n",
461
+ " 'realis': 'O'},\n",
462
+ " {'id': 128, 'token': \" '\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
463
+ " {'id': 11477,\n",
464
+ " 'token': ' Snapchat',\n",
465
+ " 'nugget': 'O',\n",
466
+ " 'argument': 'B-PII',\n",
467
+ " 'realis': 'O'},\n",
468
+ " {'id': 2349,\n",
469
+ " 'token': ' accounts',\n",
470
+ " 'nugget': 'O',\n",
471
+ " 'argument': 'I-PII',\n",
472
+ " 'realis': 'O'},\n",
473
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
474
+ " {'id': 286, 'token': ' For', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
475
+ " {'id': 4327,\n",
476
+ " 'token': ' instance',\n",
477
+ " 'nugget': 'O',\n",
478
+ " 'argument': 'O',\n",
479
+ " 'realis': 'O'},\n",
480
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
481
+ " {'id': 124, 'token': ' back', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
482
+ " {'id': 11, 'token': ' in', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
483
+ " {'id': 628,\n",
484
+ " 'token': ' late',\n",
485
+ " 'nugget': 'O',\n",
486
+ " 'argument': 'B-Time',\n",
487
+ " 'realis': 'O'},\n",
488
+ " {'id': 1014,\n",
489
+ " 'token': ' 2013',\n",
490
+ " 'nugget': 'O',\n",
491
+ " 'argument': 'I-Time',\n",
492
+ " 'realis': 'O'},\n",
493
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
494
+ " {'id': 10,\n",
495
+ " 'token': ' a',\n",
496
+ " 'nugget': 'O',\n",
497
+ " 'argument': 'B-Person',\n",
498
+ " 'realis': 'O'},\n",
499
+ " {'id': 333,\n",
500
+ " 'token': ' group',\n",
501
+ " 'nugget': 'O',\n",
502
+ " 'argument': 'I-Person',\n",
503
+ " 'realis': 'O'},\n",
504
+ " {'id': 9,\n",
505
+ " 'token': ' of',\n",
506
+ " 'nugget': 'O',\n",
507
+ " 'argument': 'I-Person',\n",
508
+ " 'realis': 'O'},\n",
509
+ " {'id': 11344,\n",
510
+ " 'token': ' hackers',\n",
511
+ " 'nugget': 'O',\n",
512
+ " 'argument': 'I-Person',\n",
513
+ " 'realis': 'O'},\n",
514
+ " {'id': 1027,\n",
515
+ " 'token': ' published',\n",
516
+ " 'nugget': 'B-Databreach',\n",
517
+ " 'argument': 'O',\n",
518
+ " 'realis': 'Actual'},\n",
519
+ " {'id': 10, 'token': ' a', 'nugget': 'O', 'argument': 'B-Data', 'realis': 'O'},\n",
520
+ " {'id': 8503,\n",
521
+ " 'token': ' database',\n",
522
+ " 'nugget': 'O',\n",
523
+ " 'argument': 'I-Data',\n",
524
+ " 'realis': 'O'},\n",
525
+ " {'id': 8200,\n",
526
+ " 'token': ' containing',\n",
527
+ " 'nugget': 'O',\n",
528
+ " 'argument': 'O',\n",
529
+ " 'realis': 'O'},\n",
530
+ " {'id': 5, 'token': ' the', 'nugget': 'O', 'argument': 'B-PII', 'realis': 'O'},\n",
531
+ " {'id': 32200,\n",
532
+ " 'token': ' usernames',\n",
533
+ " 'nugget': 'O',\n",
534
+ " 'argument': 'I-PII',\n",
535
+ " 'realis': 'O'},\n",
536
+ " {'id': 8, 'token': ' and', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
537
+ " {'id': 1028,\n",
538
+ " 'token': ' phone',\n",
539
+ " 'nugget': 'O',\n",
540
+ " 'argument': 'B-PII',\n",
541
+ " 'realis': 'O'},\n",
542
+ " {'id': 1530,\n",
543
+ " 'token': ' numbers',\n",
544
+ " 'nugget': 'O',\n",
545
+ " 'argument': 'I-PII',\n",
546
+ " 'realis': 'O'},\n",
547
+ " {'id': 9, 'token': ' of', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
548
+ " {'id': 2219,\n",
549
+ " 'token': ' approximately',\n",
550
+ " 'nugget': 'O',\n",
551
+ " 'argument': 'O',\n",
552
+ " 'realis': 'O'},\n",
553
+ " {'id': 204, 'token': ' 4', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
554
+ " {'id': 4, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
555
+ " {'id': 401, 'token': '6', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
556
+ " {'id': 153,\n",
557
+ " 'token': ' million',\n",
558
+ " 'nugget': 'O',\n",
559
+ " 'argument': 'O',\n",
560
+ " 'realis': 'O'},\n",
561
+ " {'id': 11477,\n",
562
+ " 'token': ' Snapchat',\n",
563
+ " 'nugget': 'O',\n",
564
+ " 'argument': 'B-Person',\n",
565
+ " 'realis': 'O'},\n",
566
+ " {'id': 1434,\n",
567
+ " 'token': ' users',\n",
568
+ " 'nugget': 'O',\n",
569
+ " 'argument': 'I-Person',\n",
570
+ " 'realis': 'O'},\n",
571
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
572
+ " {'id': 1437, 'token': ' ', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
573
+ " {'id': 50118, 'token': '\\n', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
574
+ " {'id': 234, 'token': ' N', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
575
+ " {'id': 4550, 'token': 'ef', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
576
+ " {'id': 38313,\n",
577
+ " 'token': 'arious',\n",
578
+ " 'nugget': 'O',\n",
579
+ " 'argument': 'O',\n",
580
+ " 'realis': 'O'},\n",
581
+ " {'id': 2172,\n",
582
+ " 'token': ' individuals',\n",
583
+ " 'nugget': 'O',\n",
584
+ " 'argument': 'O',\n",
585
+ " 'realis': 'O'},\n",
586
+ " {'id': 115, 'token': ' could', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
587
+ " {'id': 33, 'token': ' have', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
588
+ " {'id': 341, 'token': ' used', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
589
+ " {'id': 14, 'token': ' that', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
590
+ " {'id': 335,\n",
591
+ " 'token': ' information',\n",
592
+ " 'nugget': 'O',\n",
593
+ " 'argument': 'O',\n",
594
+ " 'realis': 'O'},\n",
595
+ " {'id': 7, 'token': ' to', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
596
+ " {'id': 4392,\n",
597
+ " 'token': ' profile',\n",
598
+ " 'nugget': 'O',\n",
599
+ " 'argument': 'O',\n",
600
+ " 'realis': 'O'},\n",
601
+ " {'id': 3247,\n",
602
+ " 'token': ' targets',\n",
603
+ " 'nugget': 'O',\n",
604
+ " 'argument': 'O',\n",
605
+ " 'realis': 'O'},\n",
606
+ " {'id': 420,\n",
607
+ " 'token': ' across',\n",
608
+ " 'nugget': 'O',\n",
609
+ " 'argument': 'O',\n",
610
+ " 'realis': 'O'},\n",
611
+ " {'id': 1533,\n",
612
+ " 'token': ' multiple',\n",
613
+ " 'nugget': 'O',\n",
614
+ " 'argument': 'O',\n",
615
+ " 'realis': 'O'},\n",
616
+ " {'id': 3748, 'token': ' web', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
617
+ " {'id': 2349,\n",
618
+ " 'token': ' accounts',\n",
619
+ " 'nugget': 'O',\n",
620
+ " 'argument': 'O',\n",
621
+ " 'realis': 'O'},\n",
622
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
623
+ " {'id': 166, 'token': ' We', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
624
+ " {'id': 67, 'token': ' also', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
625
+ " {'id': 6056, 'token': ' ca', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
626
+ " {'id': 295, 'token': ' n', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
627
+ " {'id': 75, 'token': \"'t\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
628
+ " {'id': 4309,\n",
629
+ " 'token': ' forget',\n",
630
+ " 'nugget': 'O',\n",
631
+ " 'argument': 'O',\n",
632
+ " 'realis': 'O'},\n",
633
+ " {'id': 59, 'token': ' about', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
634
+ " {'id': 5, 'token': ' the', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
635
+ " {'id': 573,\n",
636
+ " 'token': ' security',\n",
637
+ " 'nugget': 'O',\n",
638
+ " 'argument': 'O',\n",
639
+ " 'realis': 'O'},\n",
640
+ " {'id': 1160,\n",
641
+ " 'token': ' incident',\n",
642
+ " 'nugget': 'O',\n",
643
+ " 'argument': 'O',\n",
644
+ " 'realis': 'O'},\n",
645
+ " {'id': 14, 'token': ' that', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
646
+ " {'id': 2756,\n",
647
+ " 'token': ' occurred',\n",
648
+ " 'nugget': 'O',\n",
649
+ " 'argument': 'O',\n",
650
+ " 'realis': 'O'},\n",
651
+ " {'id': 124, 'token': ' back', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
652
+ " {'id': 11, 'token': ' in', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
653
+ " {'id': 902,\n",
654
+ " 'token': ' February',\n",
655
+ " 'nugget': 'O',\n",
656
+ " 'argument': 'B-Time',\n",
657
+ " 'realis': 'O'},\n",
658
+ " {'id': 336,\n",
659
+ " 'token': ' 2016',\n",
660
+ " 'nugget': 'O',\n",
661
+ " 'argument': 'I-Time',\n",
662
+ " 'realis': 'O'},\n",
663
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
664
+ " {'id': 96, 'token': ' In', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
665
+ " {'id': 14,\n",
666
+ " 'token': ' that',\n",
667
+ " 'nugget': 'B-Phishing',\n",
668
+ " 'argument': 'O',\n",
669
+ " 'realis': 'O'},\n",
670
+ " {'id': 908,\n",
671
+ " 'token': ' attack',\n",
672
+ " 'nugget': 'I-Phishing',\n",
673
+ " 'argument': 'O',\n",
674
+ " 'realis': 'Actual'},\n",
675
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
676
+ " {'id': 951,\n",
677
+ " 'token': ' someone',\n",
678
+ " 'nugget': 'O',\n",
679
+ " 'argument': 'B-Person',\n",
680
+ " 'realis': 'O'},\n",
681
+ " {'id': 7444,\n",
682
+ " 'token': ' posed',\n",
683
+ " 'nugget': 'B-Phishing',\n",
684
+ " 'argument': 'O',\n",
685
+ " 'realis': 'Actual'},\n",
686
+ " {'id': 25,\n",
687
+ " 'token': ' as',\n",
688
+ " 'nugget': 'I-Phishing',\n",
689
+ " 'argument': 'O',\n",
690
+ " 'realis': 'Actual'},\n",
691
+ " {'id': 5, 'token': ' the', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
692
+ " {'id': 138,\n",
693
+ " 'token': ' company',\n",
694
+ " 'nugget': 'O',\n",
695
+ " 'argument': 'B-Organization',\n",
696
+ " 'realis': 'O'},\n",
697
+ " {'id': 128, 'token': \" '\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
698
+ " {'id': 29, 'token': 's', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
699
+ " {'id': 1324,\n",
700
+ " 'token': ' CEO',\n",
701
+ " 'nugget': 'O',\n",
702
+ " 'argument': 'B-Person',\n",
703
+ " 'realis': 'O'},\n",
704
+ " {'id': 8, 'token': ' and', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
705
+ " {'id': 7013,\n",
706
+ " 'token': ' convinced',\n",
707
+ " 'nugget': 'B-Phishing',\n",
708
+ " 'argument': 'O',\n",
709
+ " 'realis': 'Actual'},\n",
710
+ " {'id': 10, 'token': ' a', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
711
+ " {'id': 11477,\n",
712
+ " 'token': ' Snapchat',\n",
713
+ " 'nugget': 'O',\n",
714
+ " 'argument': 'B-Person',\n",
715
+ " 'realis': 'O'},\n",
716
+ " {'id': 3200,\n",
717
+ " 'token': ' employee',\n",
718
+ " 'nugget': 'O',\n",
719
+ " 'argument': 'I-Person',\n",
720
+ " 'realis': 'O'},\n",
721
+ " {'id': 7, 'token': ' to', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
722
+ " {'id': 2142,\n",
723
+ " 'token': ' send',\n",
724
+ " 'nugget': 'O',\n",
725
+ " 'argument': 'B-Purpose',\n",
726
+ " 'realis': 'O'},\n",
727
+ " {'id': 81,\n",
728
+ " 'token': ' over',\n",
729
+ " 'nugget': 'O',\n",
730
+ " 'argument': 'I-Purpose',\n",
731
+ " 'realis': 'O'},\n",
732
+ " {'id': 10984,\n",
733
+ " 'token': ' payroll',\n",
734
+ " 'nugget': 'O',\n",
735
+ " 'argument': 'I-Purpose',\n",
736
+ " 'realis': 'O'},\n",
737
+ " {'id': 335,\n",
738
+ " 'token': ' information',\n",
739
+ " 'nugget': 'O',\n",
740
+ " 'argument': 'I-Purpose',\n",
741
+ " 'realis': 'O'},\n",
742
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
743
+ " {'id': 1437, 'token': ' ', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
744
+ " {'id': 50118, 'token': '\\n', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
745
+ " {'id': 20,\n",
746
+ " 'token': ' The',\n",
747
+ " 'nugget': 'B-Phishing',\n",
748
+ " 'argument': 'O',\n",
749
+ " 'realis': 'Actual'},\n",
750
+ " {'id': 1800,\n",
751
+ " 'token': ' successful',\n",
752
+ " 'nugget': 'I-Phishing',\n",
753
+ " 'argument': 'O',\n",
754
+ " 'realis': 'Actual'},\n",
755
+ " {'id': 28237,\n",
756
+ " 'token': ' phish',\n",
757
+ " 'nugget': 'I-Phishing',\n",
758
+ " 'argument': 'O',\n",
759
+ " 'realis': 'Actual'},\n",
760
+ " {'id': 3284,\n",
761
+ " 'token': ' ultimately',\n",
762
+ " 'nugget': 'O',\n",
763
+ " 'argument': 'O',\n",
764
+ " 'realis': 'O'},\n",
765
+ " {'id': 13969,\n",
766
+ " 'token': ' compromised',\n",
767
+ " 'nugget': 'B-Databreach',\n",
768
+ " 'argument': 'O',\n",
769
+ " 'realis': 'Actual'},\n",
770
+ " {'id': 4295,\n",
771
+ " 'token': ' dozens',\n",
772
+ " 'nugget': 'O',\n",
773
+ " 'argument': 'O',\n",
774
+ " 'realis': 'O'},\n",
775
+ " {'id': 9, 'token': ' of', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
776
+ " {'id': 1321,\n",
777
+ " 'token': ' employees',\n",
778
+ " 'nugget': 'O',\n",
779
+ " 'argument': 'B-Person',\n",
780
+ " 'realis': 'O'},\n",
781
+ " {'id': 128, 'token': \" '\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
782
+ " {'id': 14875,\n",
783
+ " 'token': ' identities',\n",
784
+ " 'nugget': 'O',\n",
785
+ " 'argument': 'B-PII',\n",
786
+ " 'realis': 'O'},\n",
787
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
788
+ " {'id': 598, 'token': ' To', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
789
+ " {'id': 28, 'token': ' be', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
790
+ " {'id': 2105, 'token': ' fair', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
791
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
792
+ " {'id': 10, 'token': ' a', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
793
+ " {'id': 14251,\n",
794
+ " 'token': ' mega',\n",
795
+ " 'nugget': 'O',\n",
796
+ " 'argument': 'O',\n",
797
+ " 'realis': 'O'},\n",
798
+ " {'id': 6999,\n",
799
+ " 'token': ' breach',\n",
800
+ " 'nugget': 'O',\n",
801
+ " 'argument': 'O',\n",
802
+ " 'realis': 'O'},\n",
803
+ " {'id': 15, 'token': ' on', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
804
+ " {'id': 5, 'token': ' the', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
805
+ " {'id': 3189,\n",
806
+ " 'token': ' scale',\n",
807
+ " 'nugget': 'O',\n",
808
+ " 'argument': 'O',\n",
809
+ " 'realis': 'O'},\n",
810
+ " {'id': 9, 'token': ' of', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
811
+ " {'id': 99, 'token': ' what', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
812
+ " {'id': 2132,\n",
813
+ " 'token': ' affected',\n",
814
+ " 'nugget': 'O',\n",
815
+ " 'argument': 'O',\n",
816
+ " 'realis': 'O'},\n",
817
+ " {'id': 6582,\n",
818
+ " 'token': ' LinkedIn',\n",
819
+ " 'nugget': 'O',\n",
820
+ " 'argument': 'O',\n",
821
+ " 'realis': 'O'},\n",
822
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
823
+ " {'id': 9494,\n",
824
+ " 'token': ' Tumblr',\n",
825
+ " 'nugget': 'O',\n",
826
+ " 'argument': 'O',\n",
827
+ " 'realis': 'O'},\n",
828
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
829
+ " {'id': 8, 'token': ' and', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
830
+ " {'id': 10354,\n",
831
+ " 'token': ' Yahoo',\n",
832
+ " 'nugget': 'O',\n",
833
+ " 'argument': 'O',\n",
834
+ " 'realis': 'O'},\n",
835
+ " {'id': 34, 'token': ' has', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
836
+ " {'id': 648, 'token': ' yet', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
837
+ " {'id': 7, 'token': ' to', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
838
+ " {'id': 2506,\n",
839
+ " 'token': ' strike',\n",
840
+ " 'nugget': 'O',\n",
841
+ " 'argument': 'O',\n",
842
+ " 'realis': 'O'},\n",
843
+ " {'id': 5, 'token': ' the', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
844
+ " {'id': 11203,\n",
845
+ " 'token': ' messaging',\n",
846
+ " 'nugget': 'O',\n",
847
+ " 'argument': 'O',\n",
848
+ " 'realis': 'O'},\n",
849
+ " {'id': 1553, 'token': ' app', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
850
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
851
+ " {'id': 125, 'token': ' But', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
852
+ " {'id': 14, 'token': ' that', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
853
+ " {'id': 128, 'token': \" '\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
854
+ " {'id': 29, 'token': 's', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
855
+ " {'id': 45, 'token': ' not', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
856
+ " {'id': 7, 'token': ' to', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
857
+ " {'id': 224, 'token': ' say', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
858
+ " {'id': 9177,\n",
859
+ " 'token': ' criminals',\n",
860
+ " 'nugget': 'O',\n",
861
+ " 'argument': 'O',\n",
862
+ " 'realis': 'O'},\n",
863
+ " {'id': 32, 'token': ' are', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
864
+ " {'id': 295, 'token': ' n', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
865
+ " {'id': 75, 'token': \"'t\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
866
+ " {'id': 667,\n",
867
+ " 'token': ' trying',\n",
868
+ " 'nugget': 'O',\n",
869
+ " 'argument': 'O',\n",
870
+ " 'realis': 'O'},\n",
871
+ " {'id': 7, 'token': ' to', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
872
+ " {'id': 465, 'token': ' find', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
873
+ " {'id': 10, 'token': ' a', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
874
+ " {'id': 169, 'token': ' way', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
875
+ " {'id': 88, 'token': ' into', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
876
+ " {'id': 82, 'token': ' people', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
877
+ " {'id': 128, 'token': \" '\", 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
878
+ " {'id': 29, 'token': 's', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
879
+ " {'id': 2349,\n",
880
+ " 'token': ' accounts',\n",
881
+ " 'nugget': 'O',\n",
882
+ " 'argument': 'O',\n",
883
+ " 'realis': 'O'},\n",
884
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
885
+ " {'id': 1437, 'token': ' ', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
886
+ " {'id': 50118, 'token': '\\n', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
887
+ " {'id': 46450,\n",
888
+ " 'token': ' Hackers',\n",
889
+ " 'nugget': 'O',\n",
890
+ " 'argument': 'O',\n",
891
+ " 'realis': 'O'},\n",
892
+ " {'id': 2563,\n",
893
+ " 'token': ' clearly',\n",
894
+ " 'nugget': 'O',\n",
895
+ " 'argument': 'O',\n",
896
+ " 'realis': 'O'},\n",
897
+ " {'id': 33, 'token': ' have', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
898
+ " {'id': 11477,\n",
899
+ " 'token': ' Snapchat',\n",
900
+ " 'nugget': 'O',\n",
901
+ " 'argument': 'O',\n",
902
+ " 'realis': 'O'},\n",
903
+ " {'id': 11, 'token': ' in', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
904
+ " {'id': 49, 'token': ' their', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
905
+ " {'id': 579, 'token': ' s', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
906
+ " {'id': 6183, 'token': 'ights', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
907
+ " {'id': 2156, 'token': ',', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
908
+ " {'id': 61, 'token': ' which', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
909
+ " {'id': 16, 'token': ' is', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
910
+ " {'id': 596, 'token': ' why', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
911
+ " {'id': 1434,\n",
912
+ " 'token': ' users',\n",
913
+ " 'nugget': 'O',\n",
914
+ " 'argument': 'O',\n",
915
+ " 'realis': 'O'},\n",
916
+ " {'id': 240, 'token': ' need', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
917
+ " {'id': 7, 'token': ' to', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
918
+ " {'id': 1532,\n",
919
+ " 'token': ' learn',\n",
920
+ " 'nugget': 'O',\n",
921
+ " 'argument': 'O',\n",
922
+ " 'realis': 'O'},\n",
923
+ " {'id': 141, 'token': ' how', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
924
+ " {'id': 7, 'token': ' to', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
925
+ " {'id': 1514, 'token': ' spot', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
926
+ " {'id': 5, 'token': ' the', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
927
+ " {'id': 2892,\n",
928
+ " 'token': ' warning',\n",
929
+ " 'nugget': 'O',\n",
930
+ " 'argument': 'O',\n",
931
+ " 'realis': 'O'},\n",
932
+ " {'id': 2434,\n",
933
+ " 'token': ' signs',\n",
934
+ " 'nugget': 'O',\n",
935
+ " 'argument': 'O',\n",
936
+ " 'realis': 'O'},\n",
937
+ " {'id': 9, 'token': ' of', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
938
+ " {'id': 10, 'token': ' a', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
939
+ " {'id': 14157,\n",
940
+ " 'token': ' hack',\n",
941
+ " 'nugget': 'O',\n",
942
+ " 'argument': 'O',\n",
943
+ " 'realis': 'O'},\n",
944
+ " {'id': 8, 'token': ' and', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
945
+ " {'id': 141, 'token': ' how', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
946
+ " {'id': 51, 'token': ' they', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
947
+ " {'id': 64, 'token': ' can', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
948
+ " {'id': 5312,\n",
949
+ " 'token': ' recover',\n",
950
+ " 'nugget': 'O',\n",
951
+ " 'argument': 'O',\n",
952
+ " 'realis': 'O'},\n",
953
+ " {'id': 49, 'token': ' their', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
954
+ " {'id': 2349,\n",
955
+ " 'token': ' accounts',\n",
956
+ " 'nugget': 'O',\n",
957
+ " 'argument': 'O',\n",
958
+ " 'realis': 'O'},\n",
959
+ " {'id': 114, 'token': ' if', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
960
+ " {'id': 951,\n",
961
+ " 'token': ' someone',\n",
962
+ " 'nugget': 'O',\n",
963
+ " 'argument': 'O',\n",
964
+ " 'realis': 'O'},\n",
965
+ " {'id': 30478,\n",
966
+ " 'token': ' compromises',\n",
967
+ " 'nugget': 'O',\n",
968
+ " 'argument': 'O',\n",
969
+ " 'realis': 'O'},\n",
970
+ " {'id': 106, 'token': ' them', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
971
+ " {'id': 479, 'token': '.', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
972
+ " {'id': 1437, 'token': ' ', 'nugget': 'O', 'argument': 'O', 'realis': 'O'},\n",
973
+ " {'id': 50118, 'token': '\\n', 'nugget': 'O', 'argument': 'O', 'realis': 'O'}]"
974
+ ]
975
+ },
976
+ "execution_count": 2,
977
+ "metadata": {},
978
+ "output_type": "execute_result"
979
+ }
980
+ ],
981
+ "source": [
982
+ "input = \"\"\"\n",
983
+ "As of June 2016, more than 150 million active users interact with one another daily via Snapchat. Others are drawn by the service's more recent features. Those include Snapcash, a method introduced for users to send mobile payments to their friends. \n",
984
+ "Given the app's popularity, it's no wonder online criminals have set their sights on hacking users' Snapchat accounts. For instance, back in late 2013, a group of hackers published a database containing the usernames and phone numbers of approximately 4.6 million Snapchat users. \n",
985
+ "Nefarious individuals could have used that information to profile targets across multiple web accounts. We also can't forget about the security incident that occurred back in February 2016. In that attack, someone posed as the company's CEO and convinced a Snapchat employee to send over payroll information. \n",
986
+ "The successful phish ultimately compromised dozens of employees' identities. To be fair, a mega breach on the scale of what affected LinkedIn, Tumblr, and Yahoo has yet to strike the messaging app. But that's not to say criminals aren't trying to find a way into people's accounts. \n",
987
+ "Hackers clearly have Snapchat in their sights, which is why users need to learn how to spot the warning signs of a hack and how they can recover their accounts if someone compromises them.\n",
988
+ "\"\"\"\n",
989
+ "\n",
990
+ "model(input)"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": null,
996
+ "metadata": {},
997
+ "outputs": [],
998
+ "source": [
999
+ "model.forward()"
1000
+ ]
1001
+ },
1002
+ {
1003
+ "cell_type": "code",
1004
+ "execution_count": null,
1005
+ "metadata": {},
1006
+ "outputs": [],
1007
+ "source": []
1008
+ }
1009
+ ],
1010
+ "metadata": {
1011
+ "kernelspec": {
1012
+ "display_name": "Python 3",
1013
+ "language": "python",
1014
+ "name": "python3"
1015
+ },
1016
+ "language_info": {
1017
+ "codemirror_mode": {
1018
+ "name": "ipython",
1019
+ "version": 3
1020
+ },
1021
+ "file_extension": ".py",
1022
+ "mimetype": "text/x-python",
1023
+ "name": "python",
1024
+ "nbconvert_exporter": "python",
1025
+ "pygments_lexer": "ipython3",
1026
+ "version": "3.9.15"
1027
+ },
1028
+ "orig_nbformat": 4
1029
+ },
1030
+ "nbformat": 4,
1031
+ "nbformat_minor": 2
1032
+ }
utils.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Part-of-speech tag vocabulary. The order is kept exactly as originally
# declared in case callers rely on a tag's position in the list.
list_of_pos_tags = [
    "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET",
    "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN",
    "PUNCT", "SCONJ", "SYM", "VERB", "X",
]
20
+
21
# Realis labels; "O" is the first entry, followed by the three named values.
realis_list = ["O", "Generic", "Other", "Actual"]
26
+
27
+
28
# Event-argument labels: "O" plus "B-"/"I-" prefixed argument types.
# NOTE(review): the order is irregular (e.g. "I-Organization" and "I-Device"
# do not follow their "B-" counterparts); presumably positions are used as
# label ids, so the original order is reproduced exactly -- do not sort.
event_args_list = [
    "O",
    "B-System", "I-System",
    "B-Organization",
    "B-Money", "I-Money",
    "B-Device",
    "B-Person", "I-Person",
    "B-Vulnerability", "I-Vulnerability",
    "B-Capabilities", "I-Capabilities",
    "I-Organization",
    "B-PaymentMethod", "I-PaymentMethod",
    "B-Data", "I-Data",
    "B-Number", "I-Number",
    "B-Malware", "I-Malware",
    "B-PII", "I-PII",
    "B-CVE", "I-CVE",
    "B-Purpose", "I-Purpose",
    "B-File", "I-File",
    "I-Device",
    "B-Time", "I-Time",
    "B-Software", "I-Software",
    "B-Patch", "I-Patch",
    "B-Version", "I-Version",
    "B-Website", "I-Website",
    "B-GPE", "I-GPE",
]
72
+
73
# Event-nugget labels: "O" plus a "B-"/"I-" pair for each event subtype.
# Order reproduced exactly from the original declaration.
event_nugget_list = [
    "O",
    "B-Ransom", "I-Ransom",
    "B-DiscoverVulnerability", "I-DiscoverVulnerability",
    "B-PatchVulnerability", "I-PatchVulnerability",
    "B-Databreach", "I-Databreach",
    "B-Phishing", "I-Phishing",
]
85
+
86
# Maps an argument type (the part after "B-"/"I-" in the argument labels)
# to a list of role names. NOTE(review): presumably these are the candidate
# roles an argument of that type may take -- confirm against the role
# classifiers. Key order and list order are kept exactly as declared.
arg_2_role = {
    "File": ["Tool", "Trusted-Entity"],
    "Person": [
        "Victim",
        "Attacker",
        "Discoverer",
        "Releaser",
        "Trusted-Entity",
        "Vulnerable_System_Owner",
    ],
    "Capabilities": ["Attack-Pattern", "Capabilities", "Issues-Addressed"],
    "Purpose": ["Purpose"],
    "Time": ["Time"],
    "PII": ["Compromised-Data", "Trusted-Entity"],
    "Data": ["Compromised-Data", "Trusted-Entity"],
    "Organization": [
        "Victim",
        "Releaser",
        "Discoverer",
        "Attacker",
        "Vulnerable_System_Owner",
        "Trusted-Entity",
    ],
    "Patch": ["Patch"],
    "Software": [
        "Vulnerable_System",
        "Victim",
        "Trusted-Entity",
        "Supported_Platform",
    ],
    "Vulnerability": ["Vulnerability"],
    "Version": ["Patch-Number", "Vulnerable_System_Version"],
    "Device": ["Vulnerable_System", "Victim", "Supported_Platform"],
    "CVE": ["CVE"],
    "Number": ["Number-of-Data", "Number-of-Victim"],
    "System": [
        "Victim",
        "Supported_Platform",
        "Vulnerable_System",
        "Trusted-Entity",
    ],
    "Malware": ["Tool"],
    "Money": ["Price", "Damage-Amount"],
    "PaymentMethod": ["Payment-Method"],
    "GPE": ["Place"],
    "Website": [
        "Trusted-Entity",
        "Tool",
        "Vulnerable_System",
        "Victim",
        "Supported_Platform",
    ],
}
109
+
110
def get_content(data):
    """Return the value stored under the ``"content"`` key of *data*."""
    content = data["content"]
    return content
112
+
113
def get_event_nugget(data):
    """Flatten every event of every hopper in *data* into nugget records.

    Walks ``data["cyberevent"]["hopper"]`` and, for each entry of each
    hopper's ``"events"``, emits a new dict carrying that event's
    ``"nugget"``, ``"type"``, ``"subtype"`` and ``"realis"`` values.
    Records appear in traversal order (hopper by hopper, event by event).
    """
    wanted_keys = ("nugget", "type", "subtype", "realis")
    records = []
    for hopper in data["cyberevent"]["hopper"]:
        for event in hopper["events"]:
            records.append({key: event[key] for key in wanted_keys})
    return records
118
def get_event_args(data):
    """Collect the arguments of all events in *data* into one flat list.

    Visits each event under ``data["cyberevent"]["hopper"][*]["events"]``
    in order; events without an ``"argument"`` key are skipped, and the
    items of every present ``"argument"`` value are appended to the result.
    """
    collected = []
    for hopper in data["cyberevent"]["hopper"]:
        for event in hopper["events"]:
            if "argument" not in event:
                continue
            collected.extend(event["argument"])
    return collected
125
+
126
def get_idxs_from_text(text, text_tokenized):
    """Locate each token of *text_tokenized* inside *text*, scanning left to right.

    Each token is searched for starting at the end of the previous match, so
    repeated tokens resolve to successive occurrences.

    Args:
        text: The string to search in.
        text_tokenized: Iterable of substrings, in the order they are expected
            to occur in *text*.

    Returns:
        A list with exactly one dict per token, in input order, each holding
        ``"word"`` (the token), ``"start_idx"`` and ``"end_idx"`` (absolute,
        end-exclusive offsets into *text*).

        A token that cannot be found after the current position is reported
        as a zero-width span at that position and does not advance the scan.
        Previously the ``-1`` returned by ``str.find`` was used as an offset,
        which produced a wrong span for that token and could misalign the
        tokens after it.
    """
    spans = []
    cursor = 0  # absolute offset in ``text`` where the next search begins

    for substring in text_tokenized:
        # Searching from ``cursor`` replaces re-slicing the remaining text on
        # every iteration and yields absolute offsets directly.
        start = text.find(substring, cursor)
        if start == -1:
            # Keep one entry per token so callers can still zip the result
            # with per-token labels; anchor it at the cursor with no width.
            start = end = cursor
        else:
            end = start + len(substring)
        spans.append(
            {
                "word": substring,
                "start_idx": start,
                "end_idx": end,
            }
        )
        cursor = end
    return spans
143
+
144
def _bio_prefix(start_idx, end_idx, span_start, span_end):
    """Return ``"B-"``, ``"I-"`` or ``None`` for a token relative to one span.

    ``"B-"`` is returned when the token starts exactly at the span start, or
    starts before it and ends inside the span (``span_start < end_idx <=
    span_end``) or exactly at ``span_end``. ``"I-"`` is returned when the
    token starts after the span start and either ends by ``span_end`` or
    starts before ``span_end``. Anything else yields ``None``.

    This is the single shared form of the condition that used to be
    copy-pasted into each of the three lookup functions below; redundant
    clauses were removed without changing which tokens match.

    NOTE(review): a token that starts before the span and ends after it
    (strictly containing the span) matches neither branch, exactly as in the
    original condition -- confirm whether that is intended.
    """
    if start_idx == span_start:
        return "B-"
    if start_idx < span_start:
        if end_idx == span_end or span_start < end_idx <= span_end:
            return "B-"
        return None
    # From here on the token starts after the span start.
    if end_idx <= span_end or start_idx < span_end:
        return "I-"
    return None


def get_entity_from_idx(start_idx, end_idx, event_nuggets):
    """Return the BIO nugget label for the token spanning ``start_idx``-``end_idx``.

    Args:
        start_idx: Start offset of the token.
        end_idx: End offset of the token.
        event_nuggets: List of dicts, each with a ``"subtype"`` and a
            ``"nugget"`` dict holding ``"startOffset"`` / ``"endOffset"``.

    Returns:
        ``"B-<subtype>"`` or ``"I-<subtype>"`` of the first nugget (in list
        order) the token matches, or ``"O"`` if none matches.
    """
    # Offsets are extracted for every nugget up front, as before, so a
    # malformed entry raises even when an earlier nugget would match.
    spans = [(nugget["nugget"]["startOffset"], nugget["nugget"]["endOffset"]) for nugget in event_nuggets]
    for nugget, (nugget_start, nugget_end) in zip(event_nuggets, spans):
        prefix = _bio_prefix(start_idx, end_idx, nugget_start, nugget_end)
        if prefix is not None:
            return prefix + nugget["subtype"]
    return "O"


def get_entity_and_realis_from_idx(start_idx, end_idx, event_nuggets):
    """Return the BIO nugget label and BIO-prefixed realis for a token.

    Args:
        start_idx: Start offset of the token.
        end_idx: End offset of the token.
        event_nuggets: List of dicts, each with ``"subtype"``, ``"realis"``
            and a ``"nugget"`` dict holding ``"startOffset"`` /
            ``"endOffset"``.

    Returns:
        A ``(label, realis)`` tuple for the first matching nugget, both parts
        carrying the same ``"B-"`` / ``"I-"`` prefix, or ``("O", "O")`` if no
        nugget matches.
    """
    spans = [(nugget["nugget"]["startOffset"], nugget["nugget"]["endOffset"]) for nugget in event_nuggets]
    for nugget, (nugget_start, nugget_end) in zip(event_nuggets, spans):
        prefix = _bio_prefix(start_idx, end_idx, nugget_start, nugget_end)
        if prefix is not None:
            return prefix + nugget["subtype"], prefix + nugget["realis"]
    return "O", "O"


def get_args_entity_from_idx(start_idx, end_idx, event_args):
    """Return the BIO argument label for the token spanning ``start_idx``-``end_idx``.

    Args:
        start_idx: Start offset of the token.
        end_idx: End offset of the token.
        event_args: List of dicts, each with top-level ``"startOffset"``,
            ``"endOffset"`` and ``"type"`` keys.

    Returns:
        ``"B-<type>"`` or ``"I-<type>"`` of the first argument (in list
        order) the token matches, or ``"O"`` if none matches.
    """
    spans = [(arg["startOffset"], arg["endOffset"]) for arg in event_args]
    for arg, (arg_start, arg_end) in zip(event_args, spans):
        prefix = _bio_prefix(start_idx, end_idx, arg_start, arg_end)
        if prefix is not None:
            return prefix + arg["type"]
    return "O"
170
+
171
def split_with_character(string, char):
    """Split *string* at every element equal to *char*, keeping the separators.

    The result alternates the pieces between separators with the separator
    itself, in original order; pieces equal to the empty string are dropped.
    Because elements are compared one at a time, a multi-character *char*
    never matches and the input comes back as a single piece.
    """
    # Positions at which a separator occurs.
    cut_points = [position for position, current in enumerate(string) if current == char]

    pieces = []
    segment_begin = 0
    for cut in cut_points:
        pieces += [string[segment_begin:cut], char]
        segment_begin = cut + 1
    pieces.append(string[segment_begin:])

    # Adjacent or leading/trailing separators leave empty pieces; discard them.
    return list(filter(lambda piece: piece != '', pieces))
181
+
182
def extend_list_with_character(content_list, character):
    """Expand the items of *content_list* that contain *character*.

    Items containing *character* are replaced by the pieces returned from
    ``split_with_character(item, character)``; all other items are carried
    over unchanged. Order is preserved and a new list is returned.
    """
    expanded = []
    for item in content_list:
        if character in item:
            pieces = split_with_character(item, character)
        else:
            pieces = [item]
        expanded += pieces
    return expanded
191
+
192
def find_dict_by_overlap(list_of_dicts, key_value_pairs):
    """Return the first dict whose start/end range overlaps *key_value_pairs*.

    Both the candidates and *key_value_pairs* are read through their
    ``"start"`` and ``"end"`` keys; the two endpoints may be given in either
    order because the smaller/larger of the pair is taken with ``min``/``max``.
    Ranges that merely touch at an endpoint count as overlapping. Returns
    ``None`` when no candidate overlaps.
    """
    def overlaps(candidate):
        candidate_high = max(candidate["start"], candidate["end"])
        query_low = min(key_value_pairs["start"], key_value_pairs["end"])
        if not (candidate_high >= query_low):
            return False
        query_high = max(key_value_pairs["start"], key_value_pairs["end"])
        candidate_low = min(candidate["start"], candidate["end"])
        return query_high >= candidate_low

    return next((candidate for candidate in list_of_dicts if overlaps(candidate)), None)