# `imports` is a project-local convenience module; the wildcard import is expected to
# provide the names used below (os, numpy as np, pandas as pd, torch, datasets,
# OrderedDict, the typing helpers Any/List/Optional/Tuple, Dataset, AutoTokenizer,
# train_test_split and read_csv_to_ner_data).
from imports import *
from utils import normalize, replace_all
class NerFeatures(object):
    """One padded NER example; every field is stored as a torch.long tensor."""

    def __init__(self, input_ids, token_type_ids, attention_mask, valid_ids, labels, label_masks):
        self.input_ids = torch.as_tensor(input_ids, dtype=torch.long)
        self.labels = torch.as_tensor(labels, dtype=torch.long)
        self.token_type_ids = torch.as_tensor(token_type_ids, dtype=torch.long)
        self.attention_mask = torch.as_tensor(attention_mask, dtype=torch.long)
        self.valid_ids = torch.as_tensor(valid_ids, dtype=torch.long)
        self.label_masks = torch.as_tensor(label_masks, dtype=torch.long)
class NerOutput(OrderedDict):
    """Dict-like output container: fields are reachable by key, attribute, or position."""

    loss: Optional[torch.FloatTensor] = torch.FloatTensor([0.0])
    tags: Optional[List[int]] = []
    cls_metrics: Optional[List[int]] = []

    def __getitem__(self, k):
        if isinstance(k, str):
            inner_dict = {k: v for (k, v) in self.items()}
            return inner_dict[k]
        else:
            return self.to_tuple()[k]

    def __setattr__(self, name, value):
        if name in self.keys() and value is not None:
            # Write through OrderedDict directly to avoid recursing into __setitem__.
            super().__setitem__(name, value)
        super().__setattr__(name, value)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def to_tuple(self) -> Tuple[Any, ...]:
        return tuple(self[k] for k in self.keys())
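
# Note (illustrative): NerOutput behaves like a transformers-style ModelOutput in that
# values stored with item assignment are also exposed as attributes, and to_tuple()
# gives positional access.
#
#   out = NerOutput()
#   out['loss'] = torch.tensor(0.5)   # __setitem__ also refreshes the attribute view
#   out.loss                          # -> tensor(0.5)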
class NerDataset(Dataset):
    def __init__(self, features: List[NerFeatures], device: str = 'cpu'):
        self.examples = features
        self.device = device

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        return {key: val.to(self.device) for key, val in self.examples[index].__dict__.items()}
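
# Example (illustrative sketch): batching NerFeatures with a standard DataLoader.
# Assumes `features` is a List[NerFeatures] such as the one built by
# feature_for_phobert below.
#
#   from torch.utils.data import DataLoader
#   loader = DataLoader(NerDataset(features, device='cpu'), batch_size=16, shuffle=True)
#   batch = next(iter(loader))   # dict of stacked tensors: input_ids, labels, ...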
def sentiment_dataset(path_folder, train_file_name, test_file_name):
    # Each CSV row carries "negative"/"positive"/"neutral" scores; the label is the
    # index of the highest-scoring class.
    def extract(path):
        data = pd.read_csv(path, encoding="utf-8").dropna()
        label = [np.argmax(i) for i in data[["negative", "positive", "neutral"]].values.astype(float)]
        text = data["text"]
        return text, label

    x_train, y_train = extract(os.path.join(path_folder, train_file_name))
    x_test, y_test = extract(os.path.join(path_folder, test_file_name))

    # Wrap both splits in a DatasetDict and tokenize with the PhoBERT sentiment tokenizer.
    train_set = datasets.Dataset.from_pandas(pd.DataFrame(data=list(zip(x_train, y_train)), columns=['text', 'label']))
    test_set = datasets.Dataset.from_pandas(pd.DataFrame(data=list(zip(x_test, y_test)), columns=['text', 'label']))
    custom_dt = datasets.DatasetDict({'train': train_set, 'test': test_set})

    tokenizer = AutoTokenizer.from_pretrained('wonrax/phobert-base-vietnamese-sentiment', use_fast=False)

    def tokenize(batch):
        return tokenizer(list(batch['text']), padding=True, truncation=True)

    custom_tokenized = custom_dt.map(tokenize, batched=True, batch_size=None)
    custom_tokenized.set_format('torch', columns=["input_ids", "token_type_ids", "attention_mask", "label"])
    return custom_tokenized
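
# Example (illustrative): the folder and file names below are placeholders.
#
#   dataset = sentiment_dataset('data/sentiment', 'train.csv', 'test.csv')
#   dataset['train'][0]   # -> {'input_ids': ..., 'attention_mask': ..., 'label': ...}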
def feature_for_phobert(data, tokenizer, max_seq_len: int = 256, use_crf: bool = False) -> List[NerFeatures]:
    """Convert sentences of (token, BIO-tag) pairs into padded NerFeatures."""
    features = []

    # BIO tag set for review aspects: parking, people, job, facilities, project,
    # salary, working environment, overtime/hours, office and benefits.
    idx2tag = {0: 'B-chỗ để xe', 1: 'B-con người', 2: 'B-công việc', 3: 'B-cơ sở vật chất',
               4: 'B-dự án', 5: 'B-lương', 6: 'B-môi trường làm việc', 7: 'B-ot/thời gian',
               8: 'B-văn phòng', 9: 'B-đãi ngộ', 10: 'I-chỗ để xe', 11: 'I-con người',
               12: 'I-công việc', 13: 'I-cơ sở vật chất', 14: 'I-dự án', 15: 'I-lương',
               16: 'I-môi trường làm việc', 17: 'I-ot/thời gian', 18: 'I-văn phòng',
               19: 'I-đãi ngộ', 20: 'O'}
    tag2idx = {v: k for k, v in idx2tag.items()}
|
    for tokens in data:
        if not tokens:
            continue
        tag_ids = [tag2idx[i[1]] for i in tokens]
        seq_len = len(tokens)
        sentence = ' '.join([tok[0] for tok in tokens])
        encoding = tokenizer(sentence, padding='max_length', truncation=True, max_length=max_seq_len)
        subwords = tokenizer.tokenize(sentence)

        # valid_ids records the first sub-token of each word (a 0/1 flag, or its position
        # when use_crf=True); label_marks flags real, non-padding label positions; and
        # valid_labels uses -100 so the loss ignores sub-token and padding positions.
        valid_ids = np.zeros(len(encoding.input_ids), dtype=int)
        label_marks = np.zeros(len(encoding.input_ids), dtype=int)
        valid_labels = np.ones(len(encoding.input_ids), dtype=int) * -100

        i = 1
        for idx, subword in enumerate(subwords):
            # A subword following an "@@"-marked piece continues the same word; skip it.
            if idx != 0 and subwords[idx - 1].endswith("@@"):
                continue
            if use_crf:
                valid_ids[i - 1] = idx + 1
            else:
                valid_ids[idx + 1] = 1
            valid_labels[idx + 1] = tag_ids[i - 1]
            i += 1

        if max_seq_len >= seq_len:
            # Pad the word-level tags out to max_seq_len.
            label_padding_size = max_seq_len - seq_len
            label_marks[:seq_len] = [1] * seq_len
            tag_ids.extend([0] * label_padding_size)
        else:
            # Truncate, keeping the last two positions free for special tokens.
            tag_ids = tag_ids[:max_seq_len]
            label_marks[:-2] = [1] * (max_seq_len - 2)
            tag_ids[-2:] = [0] * 2

        if use_crf and label_marks[0] == 0:
            # CRF decoding needs a valid label slot at position 0; warn and stop.
            print(f"{sentence} - {tag_ids} have mark == 0 at index 0!")
            break

        items = {key: val for key, val in encoding.items()}
        items['labels'] = tag_ids if use_crf else valid_labels
        items['valid_ids'] = valid_ids
        items['label_masks'] = label_marks if use_crf else valid_ids
        features.append(NerFeatures(**items))

        for k, v in items.items():
            assert len(v) == max_seq_len, f"Expected length of {k} is {max_seq_len} but got {len(v)}"

    return features
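
# Example (illustrative sketch): building features from one toy sentence. The model
# name and the sample tokens are assumptions for demonstration, not repo data.
#
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base', use_fast=False)
#   sample = [[('văn_phòng', 'B-văn phòng'), ('đẹp', 'O')]]
#   feats = feature_for_phobert(sample, tokenizer, max_seq_len=256, use_crf=False)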
def topic_dataset(path_folder, file_name, tokenizer, use_crf=True):
    data = read_csv_to_ner_data(os.path.join(path_folder, file_name))
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

    train_set = NerDataset(feature_for_phobert(train_data, tokenizer=tokenizer, use_crf=use_crf))
    test_set = NerDataset(feature_for_phobert(test_data, tokenizer=tokenizer, use_crf=use_crf))
    return train_set, test_set
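
# Example (illustrative): the folder and file name below are placeholders; the CSV
# must be in the format expected by read_csv_to_ner_data.
#
#   tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base', use_fast=False)
#   train_set, test_set = topic_dataset('data/topic', 'topics.csv', tokenizer, use_crf=True)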