nam194 committed on
Commit
8824528
1 Parent(s): 632f4d8

Create all_datasets.py

Files changed (1)
  all_datasets.py +169 -0
all_datasets.py ADDED
@@ -0,0 +1,169 @@
+ from imports import *
+ from utils import normalize, replace_all
+
+ class NerFeatures(object):
+     def __init__(self, input_ids, token_type_ids, attention_mask, valid_ids, labels, label_masks):
+         self.input_ids = torch.as_tensor(input_ids, dtype=torch.long)
+         self.labels = torch.as_tensor(labels, dtype=torch.long)
+         self.token_type_ids = torch.as_tensor(token_type_ids, dtype=torch.long)
+         self.attention_mask = torch.as_tensor(attention_mask, dtype=torch.long)
+         self.valid_ids = torch.as_tensor(valid_ids, dtype=torch.long)
+         self.label_masks = torch.as_tensor(label_masks, dtype=torch.long)
+
+ class NerOutput(OrderedDict):
+     loss: Optional[torch.FloatTensor] = torch.FloatTensor([0.0])
+     tags: Optional[List[int]] = []
+     cls_metrics: Optional[List[int]] = []
+
+     def __getitem__(self, k):
+         if isinstance(k, str):
+             inner_dict = {k: v for (k, v) in self.items()}
+             return inner_dict[k]
+         else:
+             return self.to_tuple()[k]
+
+     def __setattr__(self, name, value):
+         if name in self.keys() and value is not None:
+             super().__setitem__(name, value)
+         super().__setattr__(name, value)
+
+     def __setitem__(self, key, value):
+         super().__setitem__(key, value)
+         super().__setattr__(key, value)
+
+     def to_tuple(self) -> Tuple[Any]:
+         return tuple(self[k] for k in self.keys())
+
+ class NerDataset(Dataset):
+     def __init__(self, features: List[NerFeatures], device: str = 'cpu'):
+         self.examples = features
+         self.device = device
+
+     def __len__(self):
+         return len(self.examples)
+
+     def __getitem__(self, index):
+         return {key: val.to(self.device) for key, val in self.examples[index].__dict__.items()}
+
+ # Return the sentiment dataset as tokenized torch tensors
+ def sentiment_dataset(path_folder, train_file_name, test_file_name):
+     def extract(path):
+         data = pd.read_csv(os.path.join(path), encoding="utf-8").dropna()
+         label = [np.argmax(i) for i in data[["negative", "positive", "neutral"]].values.astype(float)]
+         # text = data["text"].apply(lambda x: x.replace("_", " "))
+         text = data["text"]  # .apply(lambda x: normalize(x))
+         return text, label
+     x_train, y_train = extract(os.path.join(path_folder, train_file_name))
+     x_test, y_test = extract(os.path.join(path_folder, test_file_name))
+     train_set = datasets.Dataset.from_pandas(pd.DataFrame(data=zip(x_train, y_train), columns=['text', 'label']))
+     test_set = datasets.Dataset.from_pandas(pd.DataFrame(data=zip(x_test, y_test), columns=['text', 'label']))
+     custom_dt = datasets.DatasetDict({'train': train_set, 'test': test_set})
+     tokenizer = AutoTokenizer.from_pretrained('wonrax/phobert-base-vietnamese-sentiment', use_fast=False)
+     def tokenize(batch):
+         return tokenizer(list(batch['text']), padding=True, truncation=True)
+     custom_tokenized = custom_dt.map(tokenize, batched=True, batch_size=None)
+     custom_tokenized.set_format('torch', columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
+     return custom_tokenized
+
+ # Support functions for the NER task: build token/tag <-> index maps
+ def get_dict_map(data, mode="token"):
+     if mode == "token":
+         vocab = list(set([j[0] for i in data for j in i]))
+     else:
+         vocab = list(set([j[1] for i in data for j in i]))
+     idx2tok = {idx: tok for idx, tok in enumerate(vocab)}
+     tok2idx = {tok: idx for idx, tok in enumerate(vocab)}
+     return tok2idx, idx2tok
+
+ # Read a token/label CSV where sentences are separated by blank (NaN) token rows
+ def read_csv_to_ner_data(path):
+     data = pd.read_csv(path, encoding="utf-8")
+     tok = list(data["token"])
+     tok = [replace_all(i) for i in tok]
+     lab = list(data["label"])
+     token = []
+     label = []
+     tmp = []
+     tmp_ = []
+     for i, txt in enumerate(tok):
+         if str(txt) != "nan":
+             tmp.append(txt)
+             tmp_.append(lab[i])
+         else:
+             token.append(tmp)
+             label.append(tmp_)
+             tmp = []
+             tmp_ = []
+
+     # regroup into one [[token, label], ...] list per sentence
+     data = []
+     tmp = []
+     for i, sent in enumerate(token):
+         for j, tok in enumerate(sent):
+             tmp.append([tok, label[i][j]])
+         data.append(tmp)
+         tmp = []
+     return data
+
+ # Build NerFeatures for the NER task
+ def feature_for_phobert(data, tokenizer, max_seq_len: int = 256, use_crf: bool = False) -> List[NerFeatures]:
+     features = []
+     tokens = []
+     tag_ids = []
+     # args = parse_arguments()
+     path = os.path.abspath("./data/topic")
+     file_name = os.listdir(path)[0]
+     df = read_csv_to_ner_data(os.path.join(path, file_name))
+     tag2idx, idx2tag = get_dict_map(df, 'tag')
+     for id, tokens in enumerate(data):
+         if tokens == []:
+             continue
+         tag_ids = [tag2idx[i[1]] for i in tokens]
+         seq_len = len(tokens)
+         sentence = ' '.join([tok[0] for tok in tokens])
+         encoding = tokenizer(sentence, padding='max_length', truncation=True, max_length=max_seq_len)
+         subwords = tokenizer.tokenize(sentence)
+         # valid_ids / valid_labels mark word-start subword positions and their label ids (-100 = ignored)
+         valid_ids = np.zeros(len(encoding.input_ids), dtype=int)
+         label_marks = np.zeros(len(encoding.input_ids), dtype=int)
+         valid_labels = np.ones(len(encoding.input_ids), dtype=int) * -100
+         i = 1
+         for idx, subword in enumerate(subwords):  # subwords[:max_seq_len-2]
+             # skip continuation subwords (the previous piece ends with "@@")
+             if idx != 0 and subwords[idx-1].endswith("@@"):
+                 continue
+             if use_crf:
+                 valid_ids[i-1] = idx + 1
+             else:
+                 valid_ids[idx+1] = 1
+                 valid_labels[idx+1] = tag_ids[i-1]
+             i += 1
+         if max_seq_len >= seq_len:
+             label_padding_size = (max_seq_len - seq_len)
+             label_marks[:seq_len] = [1] * seq_len
+             tag_ids.extend([0] * label_padding_size)
+         else:
+             tag_ids = tag_ids[:max_seq_len]
+             label_marks[:-2] = [1] * (max_seq_len - 2)
+             tag_ids[-2:] = [0] * 2
+         if use_crf and label_marks[0] == 0:
+             try:
+                 raise ValueError(f"{sentence} - {tag_ids} has mark == 0 at index 0!")
+             except ValueError:
+                 print(f"{sentence} - {tag_ids} has mark == 0 at index 0!")
+                 break
+         items = {key: val for key, val in encoding.items()}
+         items['labels'] = tag_ids if use_crf else valid_labels
+         items['valid_ids'] = valid_ids
+         items['label_masks'] = label_marks if use_crf else valid_ids
+         features.append(NerFeatures(**items))
+         for k, v in items.items():
+             assert len(v) == max_seq_len, f"Expected length of {k} is {max_seq_len} but got {len(v)}"
+         tokens = []
+         tag_ids = []
+     return features
+
+ # create ner dataset
+ def topic_dataset(path_folder, file_name, tokenizer, use_crf=True):
+     data = read_csv_to_ner_data(os.path.join(path_folder, file_name))
+     train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
+     # token2idx, idx2token = get_dict_map(train_data+test_data, 'token')
+     # tag2idx, idx2tag = get_dict_map(data, 'tag')
+
+     train_set = NerDataset(feature_for_phobert(train_data, tokenizer=tokenizer, use_crf=use_crf))
+     test_set = NerDataset(feature_for_phobert(test_data, tokenizer=tokenizer, use_crf=use_crf))
+     return train_set, test_set
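
A minimal usage sketch for the two dataset builders above, assuming the CSV layouts implied by the code; the data paths, file names, and the vinai/phobert-base tokenizer below are placeholder assumptions, not defined by this commit:

    from transformers import AutoTokenizer
    from all_datasets import sentiment_dataset, topic_dataset

    # Sentiment: CSVs with "text", "negative", "positive", "neutral" columns (placeholder paths)
    sentiment_data = sentiment_dataset("./data/sentiment", "train.csv", "test.csv")

    # NER/topic: a token/label CSV with blank rows between sentences (placeholder path);
    # note that feature_for_phobert also reads ./data/topic internally to build the tag map
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    train_set, test_set = topic_dataset("./data/topic", "topic_data.csv", tokenizer, use_crf=True)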