from imports import *
from utils import normalize, replace_all
class NerFeatures(object):
    """Container holding one tokenized NER example as torch long tensors."""

    def __init__(self, input_ids, token_type_ids, attention_mask, valid_ids, labels, label_masks):
        self.input_ids = torch.as_tensor(input_ids, dtype=torch.long)
        self.labels = torch.as_tensor(labels, dtype=torch.long)
        self.token_type_ids = torch.as_tensor(token_type_ids, dtype=torch.long)
        self.attention_mask = torch.as_tensor(attention_mask, dtype=torch.long)
        self.valid_ids = torch.as_tensor(valid_ids, dtype=torch.long)
        self.label_masks = torch.as_tensor(label_masks, dtype=torch.long)
class NerOutput(OrderedDict):
    """Model output container (similar in spirit to HuggingFace's ModelOutput):
    values are reachable as attributes, as string keys, or by position."""
    loss: Optional[torch.FloatTensor] = torch.FloatTensor([0.0])
    tags: Optional[List[int]] = []
    cls_metrics: Optional[List[int]] = []

    def __getitem__(self, k):
        # String keys behave like a dict lookup; integer keys index the tuple view.
        if isinstance(k, str):
            inner_dict = {k: v for (k, v) in self.items()}
            return inner_dict[k]
        else:
            return self.to_tuple()[k]

    def __setattr__(self, name, value):
        # Keep attribute assignment and item assignment in sync.
        if name in self.keys() and value is not None:
            super().__setitem__(name, value)
        super().__setattr__(name, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def to_tuple(self) -> Tuple[Any, ...]:
        return tuple(self[k] for k in self.keys())
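# Illustrative sketch (not part of the original file): NerOutput exposes the
# same value through attribute access, string keys, and positional indexing.
# The tensor and tag values below are made up purely for demonstration.
def _example_ner_output():
    out = NerOutput()
    out["loss"] = torch.FloatTensor([0.5])
    out["tags"] = [1, 2, 0]
    assert out.tags == out["tags"] == out[1]  # attribute, key, and tuple index agree
    return out.to_tuple()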
class NerDataset(Dataset):
    """Torch Dataset that moves each NerFeatures example to the target device on access."""

    def __init__(self, features: List[NerFeatures], device: str = 'cpu'):
        self.examples = features
        self.device = device

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        return {key: val.to(self.device) for key, val in self.examples[index].__dict__.items()}
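# Illustrative sketch (not part of the original file): wrapping a list of
# NerFeatures in NerDataset and batching it with a standard DataLoader. The
# batch size is an arbitrary choice for the example.
def _example_ner_dataloader(features: List[NerFeatures]):
    from torch.utils.data import DataLoader  # local import in case `imports` does not re-export it
    dataset = NerDataset(features, device='cpu')
    loader = DataLoader(dataset, batch_size=16, shuffle=True)
    return next(iter(loader))  # dict of stacked tensors: input_ids, labels, attention_mask, ...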
# Build the sentiment dataset and return it as tokenized torch tensors.
def sentiment_dataset(path_folder, train_file_name, test_file_name):
    # Read a CSV with "text", "negative", "positive", "neutral" columns and
    # collapse the three score columns into a single class index.
    def extract(path):
        data = pd.read_csv(path, encoding="utf-8").dropna()
        label = [np.argmax(i) for i in data[["negative", "positive", "neutral"]].values.astype(float)]
        # text = data["text"].apply(lambda x: x.replace("_", " "))
        text = data["text"]  # .apply(lambda x: normalize(x))
        return text, label
    x_train, y_train = extract(os.path.join(path_folder, train_file_name))
    x_test, y_test = extract(os.path.join(path_folder, test_file_name))
    # Wrap the splits in a HuggingFace DatasetDict so they can be tokenized in batches.
    train_set = datasets.Dataset.from_pandas(pd.DataFrame(data=list(zip(x_train, y_train)), columns=['text', 'label']))
    test_set = datasets.Dataset.from_pandas(pd.DataFrame(data=list(zip(x_test, y_test)), columns=['text', 'label']))
    custom_dt = datasets.DatasetDict({'train': train_set, 'test': test_set})
    tokenizer = AutoTokenizer.from_pretrained('wonrax/phobert-base-vietnamese-sentiment', use_fast=False)
    def tokenize(batch):
        return tokenizer(list(batch['text']), padding=True, truncation=True)
    custom_tokenized = custom_dt.map(tokenize, batched=True, batch_size=None)
    custom_tokenized.set_format('torch', columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
    return custom_tokenized
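# Illustrative sketch (not part of the original file): expected call pattern for
# sentiment_dataset(). The folder and file names are placeholders, not paths
# shipped with this repo; the CSVs must contain "text", "negative", "positive"
# and "neutral" columns as read by extract() above.
def _example_sentiment_dataset():
    tokenized = sentiment_dataset("data/sentiment", "train.csv", "test.csv")  # hypothetical paths
    sample = tokenized["train"][0]  # dict of tensors: input_ids, token_type_ids, attention_mask, label
    return sample["input_ids"], sample["label"]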
# Convert word/tag sequences into model-ready features for the NER task.
def feature_for_phobert(data, tokenizer, max_seq_len: int=256, use_crf: bool = False) -> List[NerFeatures]:
features = []
tokens = []
tag_ids = []
idx2tag = {0: 'B-chỗ để xe', 1: 'B-con người', 2: 'B-công việc', 3: 'B-cơ sở vật chất', 4: 'B-dự án', 5: 'B-lương', 6: 'B-môi trường làm việc', 7: 'B-ot/thời gian', 8: 'B-văn phòng', 9: 'B-đãi ngộ', 10: 'I-chỗ để xe', 11: 'I-con người', 12: 'I-công việc', 13: 'I-cơ sở vật chất', 14: 'I-dự án', 15: 'I-lương', 16: 'I-môi trường làm việc', 17: 'I-ot/thời gian', 18: 'I-văn phòng', 19: 'I-đãi ngộ', 20: 'O'}
tag2idx = {v: k for k, v in idx2tag.items()}
    for tokens in data:  # each element of `data` is one sentence: a list of (word, tag) pairs
if tokens == []:
continue
tag_ids = [tag2idx[i[1]] for i in tokens]
seq_len = len(tokens)
sentence = ' '.join([tok[0] for tok in tokens])
encoding = tokenizer(sentence, padding='max_length', truncation=True, max_length=max_seq_len)
subwords = tokenizer.tokenize(sentence)
        # valid_ids marks the subword positions that carry a word-level tag,
        # label_marks masks out padding for the CRF, and valid_labels holds the
        # subword-aligned tag ids (-100 = ignored by the loss).
        valid_ids = np.zeros(len(encoding.input_ids), dtype=int)
        label_marks = np.zeros(len(encoding.input_ids), dtype=int)
        valid_labels = np.ones(len(encoding.input_ids), dtype=int) * -100
        i = 1
        # Align word-level tags with subword tokens: PhoBERT's BPE marks
        # non-final pieces with a trailing "@@", so only the first piece of each
        # word gets the tag and continuation pieces are skipped. The idx+1
        # offset accounts for the <s> token prepended by the tokenizer.
        for idx, subword in enumerate(subwords):  # subwords[:max_seq_len-2]
            if idx != 0 and subwords[idx-1].endswith("@@"):
                continue
            if use_crf:
                valid_ids[i-1] = idx + 1
            else:
                valid_ids[idx+1] = 1
            valid_labels[idx+1] = tag_ids[i-1]
            i += 1
        # Pad (or truncate) the word-level tags and the label mask to max_seq_len;
        # the last two positions of a truncated sequence are reserved for special tokens.
        if max_seq_len >= seq_len:
            label_padding_size = (max_seq_len - seq_len)
            label_marks[:seq_len] = [1] * seq_len
            tag_ids.extend([0] * label_padding_size)
        else:
            tag_ids = tag_ids[:max_seq_len]
            label_marks[:-2] = [1] * (max_seq_len - 2)
            tag_ids[-2:] = [0] * 2
        if use_crf and label_marks[0] == 0:
            # A CRF layer cannot start a sequence on a masked position; report and stop.
            print(f"{sentence} - {tag_ids} have mark == 0 at index 0!")
            break
        items = {key: val for key, val in encoding.items()}
        # CRF training consumes word-level tags and marks; the token-level head uses subword-aligned labels.
        items['labels'] = tag_ids if use_crf else valid_labels
        items['valid_ids'] = valid_ids
        items['label_masks'] = label_marks if use_crf else valid_ids
features.append(NerFeatures(**items))
for k, v in items.items():
assert len(v) == max_seq_len, f"Expected length of {k} is {max_seq_len} but got {len(v)}"
tokens = []
tag_ids = []
return features
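# Illustrative sketch (not part of the original file): feature_for_phobert()
# expects a list of sentences, each a list of (word, tag) pairs drawn from the
# tag set in idx2tag above. The checkpoint name and the toy sentence are
# assumptions for the example, not values taken from this repo.
def _example_feature_for_phobert():
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)  # assumed PhoBERT checkpoint
    data = [[("lương", "B-lương"), ("cao", "O")]]  # one toy sentence of (word, tag) pairs
    features = feature_for_phobert(data, tokenizer=tokenizer, max_seq_len=32, use_crf=False)
    return features[0].input_ids.shape  # torch.Size([32]), i.e. (max_seq_len,)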
# Create train/test datasets for the NER task.
def topic_dataset(path_folder, file_name, tokenizer, use_crf=True):
data = read_csv_to_ner_data(os.path.join(path_folder, file_name))
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# token2idx, idx2token = get_dict_map(train_data+test_data, 'token')
#tag2idx, idx2tag = get_dict_map(data, 'tag')
train_set = NerDataset(feature_for_phobert(train_data, tokenizer=tokenizer, use_crf=use_crf))
test_set = NerDataset(feature_for_phobert(test_data, tokenizer=tokenizer, use_crf=use_crf))
return train_set, test_set
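# Illustrative sketch (not part of the original file): building the NER
# train/test splits and batching them. The folder/file names and checkpoint are
# placeholders; the CSV must be in the format expected by read_csv_to_ner_data().
def _example_topic_dataset():
    from torch.utils.data import DataLoader  # local import in case `imports` does not re-export it
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)  # assumed PhoBERT checkpoint
    train_set, test_set = topic_dataset("data/ner", "reviews.csv", tokenizer, use_crf=True)  # hypothetical paths
    train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=16)
    return train_loader, test_loader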