|
import os
import shutil
import unicodedata
from typing import Dict, Optional, Tuple

import regex as re
import requests
import torch
from tokenizers.implementations import CharBPETokenizer
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizer
|
|
|
class VnSmartphoneAbsaTokenizer(PreTrainedTokenizer): |
|
vocab_files_names = { |
|
"vocab_file": "vocab.txt", |
|
"merge_file": "merge.txt", |
|
} |
|
pretrained_vocab_files_map = { |
|
"vocab_file": "https://huggingface.co/ptdat/vn-smartphone-absa/resolve/main/vocab.txt", |
|
"merge_file": "https://huggingface.co/ptdat/vn-smartphone-absa/resolve/main/merge.txt" |
|
} |
|
model_input_names = ["input_ids", "attention_mask"] |
|
|
|
def __init__( |
|
self, |
|
vocab_file, |
|
merge_file, |
|
bos_token="<s>", |
|
eos_token="</s>", |
|
sep_token="</s>", |
|
cls_token="<s>", |
|
unk_token="<unk>", |
|
pad_token="<pad>", |
|
mask_token="<mask>", |
|
**kwargs |
|
): |
|
self.vocab_file = vocab_file |
|
self.merge_file = merge_file |
|
|
|
        # Character-level BPE over whitespace-split, lowercased text; the BERT
        # normalizer is disabled so Vietnamese diacritics pass through intact.
        self.tokenizer = CharBPETokenizer(
            vocab_file,
            merge_file,
            lowercase=True,
            bert_normalizer=False,
            split_on_whitespace_only=True,
        )
        # Wrap single sequences and pairs in <s> ... </s> (token ids 2 and 3).
        self.tokenizer.post_processor = TemplateProcessing(
            single="<s> $A </s>",
            pair="<s> $A </s> $B:1 </s>:1",
            special_tokens=[
                ("<s>", 2),
                ("</s>", 3),
            ],
        )
        self.tokenizer.enable_padding(pad_token="<pad>")
|
|
|
self.encoder = self.tokenizer.get_vocab() |
|
self.decoder = {v: k for k, v in self.encoder.items()} |
|
|
|
self.prepare_preprocess() |
|
|
|
super().__init__( |
|
bos_token=bos_token, |
|
eos_token=eos_token, |
|
sep_token=sep_token, |
|
cls_token=cls_token, |
|
unk_token=unk_token, |
|
pad_token=pad_token, |
|
mask_token=mask_token, |
|
**kwargs |
|
) |
|
|
|
def _tokenize(self, text: str): |
|
text = self.normalize(text) |
|
return self.tokenizer.encode(text).tokens |
|
|
|
def get_vocab(self) -> Dict[str, int]: |
|
return self.tokenizer.get_vocab() |
|
|
|
@property |
|
def vocab_size(self): |
|
return self.tokenizer.get_vocab_size() |
|
|
|
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, str]:
        if not os.path.isdir(save_directory):
            raise ValueError(f"Vocabulary path ({save_directory}) should be a directory")
        prefix = filename_prefix + "-" if filename_prefix else ""
        out_vocab_file = os.path.join(save_directory, prefix + "vocab.txt")
        out_merge_file = os.path.join(save_directory, prefix + "merge.txt")

        # Copy each source file unless it is already at the destination.
        if os.path.isfile(self.vocab_file) and os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            shutil.copyfile(self.vocab_file, out_vocab_file)
        if os.path.isfile(self.merge_file) and os.path.abspath(self.merge_file) != os.path.abspath(out_merge_file):
            shutil.copyfile(self.merge_file, out_merge_file)

        return out_vocab_file, out_merge_file
|
|
|
def _convert_token_to_id(self, token: str): |
|
return self.encoder.get(token, self.encoder[self.unk_token]) |
|
|
|
    def _convert_id_to_token(self, index: int):
        return self.decoder.get(index, self.unk_token)
|
|
|
def prepare_preprocess(self): |
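        """Build the lookup tables used by normalize(): a decomposed-to-composed
        Unicode map, Vietnamese vowel/tone tables, emoticon and unit regexes,
        and a slang/misspelling dictionary."""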
|
self.uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ" |
|
self.unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU" |
|
|
|
        # Map legacy decomposed (combining-diacritic) sequences to composed
        # NFC characters. The original listed both character lists literally;
        # the decomposed keys are derived here with NFD so the two lists
        # cannot silently collapse into an identity mapping.
        self.dict_char = {}
        charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split('|')
        for char in charutf8:
            decomposed = unicodedata.normalize("NFD", char)
            if decomposed != char:
                self.dict_char[decomposed] = char

        # Vietnamese vowel table: each row holds the base vowel, its five
        # tone-marked forms (grave, acute, hook above, tilde, dot below),
        # and the Telex spelling of the base vowel.
        self.bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                               ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                               ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                               ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                               ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                               ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                               ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                               ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                               ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                               ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                               ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                               ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
        # Telex tone suffixes aligned with the tone columns above:
        # none, f (grave), s (acute), r (hook), x (tilde), j (dot below).
        self.bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']
|
|
|
        # Reverse index: vowel character -> (row in bang_nguyen_am, tone id).
        self.nguyen_am_to_ids = {}
        for i in range(len(self.bang_nguyen_am)):
            for j in range(len(self.bang_nguyen_am[i]) - 1):
                self.nguyen_am_to_ids[self.bang_nguyen_am[i][j]] = (i, j)
|
|
|
        # Regex substitutions for emoticons, arrows, and unit shorthands
        # (e.g. "30'" -> "30 phút", "5tr" -> "5 triệu"), applied in normalize().
        self.sp_word_sub = {
|
"@@": "confuseeyes", |
|
"℅": "%", |
|
r"/": " fraction ", |
|
r":\)+": "smileface", |
|
r";\)+": "smileface", |
|
r":\*+": "kissingface", |
|
r"=\)+": "playfulsmileface", |
|
r"=\(+": "playfulsadface", |
|
r":\(+": "sadface", |
|
r":3+": "threeface", |
|
r":v+": "vface", |
|
r"\^\^": "kindsmile", |
|
r"\^_\^": "kindmountsmile", |
|
r"\^\.\^": "kindmountsmile", |
|
r"-_-": "disapointface", |
|
r"\._\.": "confusedface", |
|
r":>+": "cutesmile", |
|
r"(\|)w(\|)": "fancycryface", |
|
r":\|": "mutedface", |
|
r":d+": "laughface", |
|
r"<3": "loveicon", |
|
r"\.{2,}": "threedot", |
|
r"-{1,}>{1,}": "arrow", |
|
r"={1,}>{1,}": "arrow", |
|
r"(\d+)h": r"\1 giờ", |
|
r"(\d+)'": r"\1 phút", |
|
r"(\d+)trieu": r"\1 triệu", |
|
r"(\d+)\s?tr": r"\1 triệu", |
|
r"blut\w+": "bluetooth", |
|
r"(\d+)\s\*": r"\1 sao" |
|
} |
|
|
|
        # Single-pass slang/misspelling dictionary: normalize() looks each
        # whole word up once, so values should already be canonical forms.
        self.replace_dict = {
|
"/": "fraction", |
|
"wf": "wifi", |
|
"wifj": "wifi", |
|
"wjfj": "wifi", |
|
"wjfi": "wifi", |
|
"wiffi": "wifi", |
|
"wj": "wifi", |
|
"ko": "không", |
|
"k": "không", |
|
"hong": "không", |
|
"đc": "được", |
|
"sp": "sản phẩm", |
|
"fb": "facebook", |
|
"ytb": "youtube", |
|
"yt": "youtube", |
|
"mes": "messenger", |
|
"mess": "messenger", |
|
"tgdđ": "thegioididong", |
|
"nv": "nhân viên", |
|
"ss": "samsung", |
|
"ip": "iphone", |
|
"appel": "apple", |
|
"oke": "ok", |
|
"okie": "ok", |
|
"okey": "ok", |
|
"oki": "ok", |
|
"oce": "ok", |
|
"okela": "ok", |
|
"mk": "mình", |
|
"sd": "sử dụng", |
|
"sdung": "sử dụng", |
|
"ae": "anh em", |
|
"lq": "liên quân", |
|
"lqmb": "liên quân mobile", |
|
"lun": "luôn", |
|
"ng": "người", |
|
"ad": "admin", |
|
"ms": "mới", |
|
"cx": "cũng", |
|
"cũg": "cũng", |
|
"nhìu": "nhiều", |
|
"bth": "bình thường", |
|
"bthg": "bình thường", |
|
"ngta": "người ta", |
|
"dow": "download", |
|
"hdh": "hệ điều hành", |
|
"hđh": "hệ điều hành", |
|
"cammera": "camera", |
|
"dt": "điện thoại", |
|
"dthoai": "điện thoại", |
|
"dth": "điện thoại", |
|
"đth": "điện thoại", |
|
"hk": "không", |
|
"j": "gì", |
|
"ji": "gì", |
|
"mn": "mọi người", |
|
"m.n": "mọi người", |
|
"mjh": "mình", |
|
"mjk": "mình", |
|
"lắc": "lag", |
|
"lác": "lag", |
|
"lang": "lag", |
|
"nhah": "nhanh", |
|
"nóichung": "nói chung", |
|
"zl": "zalo", |
|
"sóg": "sóng", |
|
"rẽ": "rẻ", |
|
"trc": "trước", |
|
"chíp": "chip", |
|
"bin": "pin", |
|
"lm": "làm", |
|
"bik": "biết", |
|
"hog": "không", |
|
"zỏm": "dổm", |
|
"z": "vậy", |
|
"v": "vậy", |
|
"nhah": "nhanh", |
|
"r": "rồi", |
|
"ỗn": "ổn", |
|
"nhìu": "nhiều", |
|
"wá": "quá", |
|
"wep": "web", |
|
"wed": "web", |
|
"fim": "phim", |
|
"film": "phim", |
|
"xạc": "sạc", |
|
"xài": "sài", |
|
"het": "hết", |
|
"lun": "luôn", |
|
"e": "em", |
|
"a": "anh", |
|
"bjo": "bây giờ", |
|
"vl": "vãi lồn", |
|
"sac": "sạc", |
|
"vidieo": "video", |
|
"tét": "test", |
|
"tes": "test", |
|
"thik": "thích", |
|
"fai": "phải", |
|
"✋": "tay", |
|
"🔋": "pin", |
|
"☆": "sao", |
|
"supper": "super", |
|
"lổi": "lỗi", |
|
"loát": "load", |
|
"thui": "thôi", |
|
"rùi": "rồi", |
|
"ỗn": "ổn", |
|
"lổi": "lỗi", |
|
"suống": "xuống", |
|
"selfi": "selfie", |
|
"gg": "google", |
|
"cam on": "cảm ơn", |
|
"tg": "thời gian", |
|
"nchung": "nói chung", |
|
"❤": "loveicon", |
|
"trại nghiệm": "trải nghiệm", |
|
"dất": "rất", |
|
"đứg": "đứng", |
|
"bằg": "bằng", |
|
"mìh": "mình", |
|
"đag": "đang", |
|
"thoi": "thôi", |
|
"củng": "cũng", |
|
"đả": "đã", |
|
"màng": "màn", |
|
"ff": "free fire", |
|
"cod": "call of duty", |
|
"moi thứ": "mọi thứ", |
|
"moi thu": "mọi thứ", |
|
"moi thư": "mọi thứ", |
|
"moi người": "mọi người", |
|
"moi": "mới", |
|
"dk": "được", |
|
"đk": "được", |
|
"nhậy": "nhạy", |
|
"ak": "á", |
|
"ghe": "nghe", |
|
"bùn": "buồn", |
|
"bit": "biết", |
|
"bít": "biết", |
|
"bnhieu": "bao nhiêu", |
|
"dụg": "dụng", |
|
"tk": "tài khoản", |
|
"sąc": "sạc", |
|
"rât": "rât", |
|
"haz": "haiz", |
|
"sai làm": "sai lầm", |
|
"flim": "film", |
|
"xướt": "xước", |
|
"viềng": "viền" |
|
} |
|
|
|
    def convert_unicode(self, text: str):
        # Rewrite decomposed (combining-diacritic) sequences to composed NFC
        # characters using the map built in prepare_preprocess().
        if not self.dict_char:
            return text
        return re.sub(
            "|".join(map(re.escape, self.dict_char.keys())),
            lambda x: self.dict_char[x.group()], text
        )
|
|
|
def is_valid_vietnam_word(self, word): |
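        """Return True if every vowel in the word sits in one contiguous run,
        a necessary condition for the word to be a single Vietnamese syllable."""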
|
chars = list(word) |
|
nguyen_am_index = -1 |
|
for index, char in enumerate(chars): |
|
x, y = self.nguyen_am_to_ids.get(char, (-1, -1)) |
|
if x != -1: |
|
if nguyen_am_index == -1: |
|
nguyen_am_index = index |
|
else: |
|
if index - nguyen_am_index != 1: |
|
return False |
|
nguyen_am_index = index |
|
return True |
|
|
|
def chuan_hoa_dau_tu_tieng_viet(self, word): |
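        """Re-place the tone mark within one word using the old-style rule
        (e.g. "hoà" -> "hòa"); words that fail is_valid_vietnam_word are
        returned unchanged."""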
|
if not self.is_valid_vietnam_word(word): |
|
return word |
|
|
|
chars = list(word) |
|
dau_cau = 0 |
|
nguyen_am_index = [] |
|
qu_or_gi = False |
|
for index, char in enumerate(chars): |
|
x, y = self.nguyen_am_to_ids.get(char, (-1, -1)) |
|
if x == -1: |
|
continue |
|
elif x == 9: |
|
if index != 0 and chars[index - 1] == 'q': |
|
chars[index] = 'u' |
|
qu_or_gi = True |
|
elif x == 5: |
|
if index != 0 and chars[index - 1] == 'g': |
|
chars[index] = 'i' |
|
qu_or_gi = True |
|
if y != 0: |
|
dau_cau = y |
|
chars[index] = self.bang_nguyen_am[x][0] |
|
if not qu_or_gi or index != 1: |
|
nguyen_am_index.append(index) |
|
if len(nguyen_am_index) < 2: |
|
if qu_or_gi: |
|
if len(chars) == 2: |
|
x, y = self.nguyen_am_to_ids.get(chars[1]) |
|
chars[1] = self.bang_nguyen_am[x][dau_cau] |
|
else: |
|
x, y = self.nguyen_am_to_ids.get(chars[2], (-1, -1)) |
|
if x != -1: |
|
chars[2] = self.bang_nguyen_am[x][dau_cau] |
|
else: |
|
chars[1] = self.bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else self.bang_nguyen_am[9][dau_cau] |
|
return ''.join(chars) |
|
return word |
|
|
|
        # Two or more vowels: ê and ơ take the tone mark regardless of position.
        for index in nguyen_am_index:
            x, y = self.nguyen_am_to_ids[chars[index]]
            if x == 4 or x == 8:  # ê, ơ
                chars[index] = self.bang_nguyen_am[x][dau_cau]
                return ''.join(chars)

        if len(nguyen_am_index) == 2:
            if nguyen_am_index[-1] == len(chars) - 1:
                # The vowel cluster ends the word: mark the first vowel.
                x, y = self.nguyen_am_to_ids[chars[nguyen_am_index[0]]]
                chars[nguyen_am_index[0]] = self.bang_nguyen_am[x][dau_cau]
            else:
                # Otherwise mark the second vowel.
                x, y = self.nguyen_am_to_ids[chars[nguyen_am_index[1]]]
                chars[nguyen_am_index[1]] = self.bang_nguyen_am[x][dau_cau]
        else:
            # Three vowels: mark the middle one.
            x, y = self.nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            chars[nguyen_am_index[1]] = self.bang_nguyen_am[x][dau_cau]

        return ''.join(chars)
|
|
|
def chuan_hoa_dau_cau_tieng_viet(self, sentence): |
|
""" |
|
Chuyển câu tiếng việt về chuẩn gõ dấu kiểu cũ. |
|
:param sentence: |
|
:return: |
|
""" |
|
words = sentence.split() |
|
for index, word in enumerate(words): |
|
cw = re.sub(r'(^\p{P}*)([p{L}.]*\p{L}+)(\p{P}*$)', r'\1/\2/\3', word).split('/') |
|
|
|
if len(cw) == 3: |
|
cw[1] = self.chuan_hoa_dau_tu_tieng_viet(cw[1]) |
|
words[index] = ''.join(cw) |
|
return ' '.join(words) |
|
|
|
def normalize(self, text: str, track_change=False): |
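        """Normalize a raw review: lowercase, replace URLs and emoticons,
        collapse repeated trailing characters, expand slang, canonicalize
        Unicode and tone marks, then split numbers, punctuation, and emoji
        into separate tokens. track_change=True prints the text after each step."""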
|
|
|
text = text.lower() |
|
|
|
text = re.sub(r"((https?|ftp|file):\/{2,3})+([-\w+&@#/%=~|$?!:,.]*)|(www.)+([-\w+&@#/%=~|$?!:,.]*)", "urllink", text) |
|
|
|
|
|
text = re.sub(r"([\D\w])\1+\b", r"\1", text) |
|
if track_change: |
|
print("Dedup trailing: ", text) |
|
|
|
|
|
for pttn, repl in self.sp_word_sub.items(): |
|
text = re.sub(fr"{pttn}", f" {repl} ", text) |
|
if track_change: |
|
print("Replace special word: ", text) |
|
|
|
|
|
def replace(match): |
|
orig = match.group(1) |
|
word = " " + self.replace_dict.get(orig, orig) + " " |
|
return word |
|
text = re.sub(r"\b(\S+)\b", replace, text) |
|
if track_change: |
|
print("Correct misspelled word: ", text) |
|
|
|
|
|
text = self.convert_unicode(text) |
|
if track_change: |
|
print("Normalize string encoding: ", text) |
|
|
|
|
|
text = self.chuan_hoa_dau_cau_tieng_viet(text) |
|
if track_change: |
|
print("Vietnamese unicode normalization: ", text) |
|
|
|
|
|
text = re.sub(r"(?<=\d)\.(?=\d{3})", "", text) |
|
if track_change: |
|
print("Eliminate decimal delimiter: ", text) |
|
|
|
|
|
text = re.sub(r"(\d+)(\D+)", r"\1 \2", text) |
|
if track_change: |
|
print("Split between value and unit: ", text) |
|
|
|
|
|
text = " ".join( |
|
re.split("(["+re.escape("!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~")+"])", text) |
|
) |
|
if track_change: |
|
print("Split by punctuations: ", text) |
|
|
|
|
|
text = " ".join( |
|
re.split("([" |
|
u"\U0001F600-\U0001F64F" |
|
u"\U0001F300-\U0001F5FF" |
|
u"\U0001F680-\U0001F6FF" |
|
u"\U0001F1E0-\U0001F1FF" |
|
u"\U00002702-\U000027B0" |
|
u"\U000024C2-\U0001F251" |
|
u"\U0001f926-\U0001f937" |
|
u'\U00010000-\U0010ffff' |
|
u"\u200d" |
|
u"\u2640-\u2642" |
|
u"\u2600-\u2B55" |
|
u"\u23cf" |
|
u"\u23e9" |
|
u"\u231a" |
|
u"\u3030" |
|
u"\ufe0f" |
|
u"\u221a" |
|
"])", text) |
|
) |
|
|
|
|
|
|
|
|
|
return text |
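

if __name__ == "__main__":
    # Minimal usage sketch, assuming vocab.txt and merge.txt are available
    # locally (e.g. downloaded from the URLs in pretrained_vocab_files_map).
    tokenizer = VnSmartphoneAbsaTokenizer("vocab.txt", "merge.txt")
    sample = "pin trâu, sạc nhanh, camera selfi đẹp lắmmm :D"
    print(tokenizer.normalize(sample, track_change=True))
    print(tokenizer(sample)["input_ids"])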