Update all_datasets.py
all_datasets.py CHANGED  +3 -43
@@ -61,54 +61,14 @@ def sentiment_dataset(path_folder, train_file_name, test_file_name):
     custom_tokenized.set_format('torch',columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
     return custom_tokenized
 
-# support function for ner task
-def get_dict_map(data, mode="token"):
-    if mode == "token":
-        vocab = list(set([j[0] for i in data for j in i]))
-    else:
-        vocab = list(set([j[1] for i in data for j in i]))
-    idx2tok = {idx:tok for idx, tok in enumerate(vocab)}
-    tok2idx = {tok:idx for idx, tok in enumerate(vocab)}
-    return tok2idx, idx2tok
-
-def read_csv_to_ner_data(path):
-    data = pd.read_csv(path, encoding="utf-8")
-    tok = list(data["token"])
-    tok = [replace_all(i) for i in tok]
-    lab = list(data["label"])
-    token = []
-    label = []
-    tmp = []
-    tmp_ = []
-    for i, txt in enumerate(tok):
-        if str(txt) != "nan":
-            tmp.append(txt)
-            tmp_.append(lab[i])
-        else:
-            token.append(tmp)
-            label.append(tmp_)
-            tmp = []
-            tmp_ = []
-
-    data = []
-    tmp = []
-    for i, sent in enumerate(token):
-        for j, tok in enumerate(sent):
-            tmp.append([tok, label[i][j]])
-        data.append(tmp)
-        tmp = []
-    return data
-
 # get feature for ner task
 def feature_for_phobert(data, tokenizer, max_seq_len: int=256, use_crf: bool = False) -> List[NerFeatures]:
     features = []
     tokens = []
     tag_ids = []
-
-
-
-    df = read_csv_to_ner_data(os.path.join(path, file_name))
-    tag2idx, idx2tag = get_dict_map(df, 'tag')
+
+    idx2tag = {0: 'B-chỗ để xe', 1: 'B-con người', 2: 'B-công việc', 3: 'B-cơ sở vật chất', 4: 'B-dự án', 5: 'B-lương', 6: 'B-môi trường làm việc', 7: 'B-ot/thời gian', 8: 'B-văn phòng', 9: 'B-đãi ngộ', 10: 'I-chỗ để xe', 11: 'I-con người', 12: 'I-công việc', 13: 'I-cơ sở vật chất', 14: 'I-dự án', 15: 'I-lương', 16: 'I-môi trường làm việc', 17: 'I-ot/thời gian', 18: 'I-văn phòng', 19: 'I-đãi ngộ', 20: 'O'}
+    tag2idx = {v: k for k, v in idx2tag.items()}
     for id, tokens in enumerate(data):
         if tokens == []:
             continue