Create all_datasets.py
all_datasets.py +169 -0
all_datasets.py
ADDED
@@ -0,0 +1,169 @@
from imports import *
from utils import normalize, replace_all

class NerFeatures(object):
    def __init__(self, input_ids, token_type_ids, attention_mask, valid_ids, labels, label_masks):
        self.input_ids = torch.as_tensor(input_ids, dtype=torch.long)
        self.labels = torch.as_tensor(labels, dtype=torch.long)
        self.token_type_ids = torch.as_tensor(token_type_ids, dtype=torch.long)
        self.attention_mask = torch.as_tensor(attention_mask, dtype=torch.long)
        self.valid_ids = torch.as_tensor(valid_ids, dtype=torch.long)
        self.label_masks = torch.as_tensor(label_masks, dtype=torch.long)

class NerOutput(OrderedDict):
    loss: Optional[torch.FloatTensor] = torch.FloatTensor([0.0])
    tags: Optional[List[int]] = []
    cls_metrics: Optional[List[int]] = []
    def __getitem__(self, k):
        if isinstance(k, str):
            inner_dict = {k: v for (k, v) in self.items()}
            return inner_dict[k]
        else:
            return self.to_tuple()[k]
    def __setattr__(self, name, value):
        if name in self.keys() and value is not None:
            super().__setitem__(name, value)
        super().__setattr__(name, value)
    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        super().__setattr__(key, value)
    def to_tuple(self) -> Tuple[Any]:
        return tuple(self[k] for k in self.keys())

class NerDataset(Dataset):
    def __init__(self, features: List[NerFeatures], device: str = 'cpu'):
        self.examples = features
        self.device = device

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        return {key: val.to(self.device) for key, val in self.examples[index].__dict__.items()}

# return the sentiment dataset as tokenized torch tensors
def sentiment_dataset(path_folder, train_file_name, test_file_name):
    def extract(path):
        data = pd.read_csv(os.path.join(path), encoding="utf-8").dropna()
        label = [np.argmax(i) for i in data[["negative", "positive", "neutral"]].values.astype(float)]
        # text = data["text"].apply(lambda x: x.replace("_"," "))
        text = data["text"]  # .apply(lambda x: normalize(x))
        return text, label
    x_train, y_train = extract(os.path.join(path_folder, train_file_name))
    x_test, y_test = extract(os.path.join(path_folder, test_file_name))
    train_set = datasets.Dataset.from_pandas(pd.DataFrame(data=zip(x_train, y_train), columns=['text', 'label']))
    test_set = datasets.Dataset.from_pandas(pd.DataFrame(data=zip(x_test, y_test), columns=['text', 'label']))
    custom_dt = datasets.DatasetDict({'train': train_set, 'test': test_set})
    tokenizer = AutoTokenizer.from_pretrained('wonrax/phobert-base-vietnamese-sentiment', use_fast=False)
    def tokenize(batch):
        return tokenizer(list(batch['text']), padding=True, truncation=True)
    custom_tokenized = custom_dt.map(tokenize, batched=True, batch_size=None)
    custom_tokenized.set_format('torch', columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
    return custom_tokenized

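# --- illustrative usage of sentiment_dataset (sketch only; folder and file
# names below are placeholders, not part of this commit). It assumes CSVs with
# "text", "negative", "positive", "neutral" columns:
#
#   sentiment_data = sentiment_dataset("./data/sentiment", "train.csv", "test.csv")
#   train_loader = torch.utils.data.DataLoader(
#       sentiment_data["train"], batch_size=32, shuffle=True)
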
# support function for the ner task: build token/tag <-> index maps
def get_dict_map(data, mode="token"):
    if mode == "token":
        vocab = list(set([j[0] for i in data for j in i]))
    else:
        vocab = list(set([j[1] for i in data for j in i]))
    idx2tok = {idx: tok for idx, tok in enumerate(vocab)}
    tok2idx = {tok: idx for idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

def read_csv_to_ner_data(path):
    data = pd.read_csv(path, encoding="utf-8")
    tok = list(data["token"])
    tok = [replace_all(i) for i in tok]
    lab = list(data["label"])
    token = []
    label = []
    tmp = []
    tmp_ = []
    # sentences are separated by empty (NaN) token rows in the CSV
    for i, txt in enumerate(tok):
        if str(txt) != "nan":
            tmp.append(txt)
            tmp_.append(lab[i])
        else:
            token.append(tmp)
            label.append(tmp_)
            tmp = []
            tmp_ = []

    # pair each token with its label, one list per sentence
    data = []
    tmp = []
    for i, sent in enumerate(token):
        for j, tok in enumerate(sent):
            tmp.append([tok, label[i][j]])
        data.append(tmp)
        tmp = []
    return data

# get features for the ner task
def feature_for_phobert(data, tokenizer, max_seq_len: int = 256, use_crf: bool = False) -> List[NerFeatures]:
    features = []
    tokens = []
    tag_ids = []
    # args = parse_arguments()
    path = os.path.abspath("./data/topic")
    file_name = os.listdir(path)[0]
    df = read_csv_to_ner_data(os.path.join(path, file_name))
    tag2idx, idx2tag = get_dict_map(df, 'tag')
    for id, tokens in enumerate(data):
        if tokens == []:
            continue
        tag_ids = [tag2idx[i[1]] for i in tokens]
        seq_len = len(tokens)
        sentence = ' '.join([tok[0] for tok in tokens])
        encoding = tokenizer(sentence, padding='max_length', truncation=True, max_length=max_seq_len)
        subwords = tokenizer.tokenize(sentence)
        valid_ids = np.zeros(len(encoding.input_ids), dtype=int)
        label_marks = np.zeros(len(encoding.input_ids), dtype=int)
        valid_labels = np.ones(len(encoding.input_ids), dtype=int) * -100
        i = 1
        # mark only the first subword of each word as valid and give it the word-level tag
        for idx, subword in enumerate(subwords):  # subwords[:max_seq_len-2]
            if idx != 0 and subwords[idx-1].endswith("@@"):
                continue
            if use_crf:
                valid_ids[i-1] = idx + 1
            else:
                valid_ids[idx+1] = 1
                valid_labels[idx+1] = tag_ids[i-1]
            i += 1
        if max_seq_len >= seq_len:
            label_padding_size = (max_seq_len - seq_len)
            label_marks[:seq_len] = [1] * seq_len
            tag_ids.extend([0] * label_padding_size)
        else:
            tag_ids = tag_ids[:max_seq_len]
            label_marks[:-2] = [1] * (max_seq_len - 2)
            tag_ids[-2:] = [0] * 2
        if use_crf and label_marks[0] == 0:
            # raising a plain string is invalid in Python 3; raise a proper exception instead
            try:
                raise ValueError(f"{sentence} - {tag_ids} have mark == 0 at index 0!")
            except ValueError:
                print(f"{sentence} - {tag_ids} have mark == 0 at index 0!")
                break
        items = {key: val for key, val in encoding.items()}
        items['labels'] = tag_ids if use_crf else valid_labels
        items['valid_ids'] = valid_ids
        items['label_masks'] = label_marks if use_crf else valid_ids
        features.append(NerFeatures(**items))
        for k, v in items.items():
            assert len(v) == max_seq_len, f"Expected length of {k} is {max_seq_len} but got {len(v)}"
        tokens = []
        tag_ids = []
    return features

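# --- how the subword alignment above behaves (illustrative sketch) ----------
# PhoBERT's BPE marks non-final pieces with a trailing "@@". If a word is split
# into ['wo@@', 'rd'], only 'wo@@' receives the word's tag; 'rd' is skipped
# because its predecessor ends with "@@". valid_ids / valid_labels therefore
# stay word-aligned (offset by 1 for the leading <s>) while input_ids stay
# subword-aligned, and positions never marked valid keep the -100 ignore label.
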
# create ner dataset
def topic_dataset(path_folder, file_name, tokenizer, use_crf=True):
    data = read_csv_to_ner_data(os.path.join(path_folder, file_name))
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    # token2idx, idx2token = get_dict_map(train_data+test_data, 'token')
    # tag2idx, idx2tag = get_dict_map(data, 'tag')

    train_set = NerDataset(feature_for_phobert(train_data, tokenizer=tokenizer, use_crf=use_crf))
    test_set = NerDataset(feature_for_phobert(test_data, tokenizer=tokenizer, use_crf=use_crf))
    return train_set, test_set
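For reference, a minimal sketch of how the NER builder might be wired up downstream; the folder, file name, and PhoBERT checkpoint below are assumptions for illustration, not part of this commit:

from transformers import AutoTokenizer

# hypothetical paths and file names, chosen only to show the call shape
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
train_set, test_set = topic_dataset("./data/topic", "topic_data.csv",
                                    tokenizer=tokenizer, use_crf=True)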