import os
import shutil
import unicodedata
from typing import Dict, Optional, Tuple

import regex as re
from tokenizers.implementations import CharBPETokenizer
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizer


class VnSmartphoneAbsaTokenizer(PreTrainedTokenizer):
    vocab_files_names = {
        "vocab_file": "vocab.txt",
        "merge_file": "merge.txt",
    }
    pretrained_vocab_files_map = {
        "vocab_file": "https://huggingface.co/ptdat/vn-smartphone-absa/resolve/main/vocab.txt",
        "merge_file": "https://huggingface.co/ptdat/vn-smartphone-absa/resolve/main/merge.txt",
    }
    model_input_names = ["input_ids", "attention_mask"]

    # The special-token strings follow the usual <s>/</s>/<unk>/<pad>/<mask>
    # convention; they must match the entries in vocab.txt.
    def __init__(
        self,
        vocab_file,
        merge_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        **kwargs
    ):
        self.vocab_file = vocab_file
        self.merge_file = merge_file
        self.tokenizer = CharBPETokenizer(
            vocab_file,
            merge_file,
            lowercase=True,
            bert_normalizer=False,
            split_on_whitespace_only=True,
        )
        # Wrap every sequence as "<s> ... </s>"; the ids 2 and 3 must match
        # the positions of <s> and </s> in vocab.txt.
        self.tokenizer.post_processor = TemplateProcessing(
            single="<s> $A </s>",
            pair="<s> $A </s> $B:1 </s>:1",
            special_tokens=[
                ("<s>", 2),
                ("</s>", 3),
            ],
        )
        self.tokenizer.enable_padding(pad_token="<pad>")
        self.encoder = self.tokenizer.get_vocab()
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.prepare_preprocess()
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs
        )

    def _tokenize(self, text: str):
        text = self.normalize(text)
        return self.tokenizer.encode(text).tokens

    def get_vocab(self) -> Dict[str, int]:
        return self.tokenizer.get_vocab()

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.txt",
        )
        out_merge_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "merge.txt",
        )
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            shutil.copyfile(self.vocab_file, out_vocab_file)
        if os.path.abspath(self.merge_file) != os.path.abspath(out_merge_file) and os.path.isfile(self.merge_file):
            shutil.copyfile(self.merge_file, out_merge_file)
        return out_vocab_file, out_merge_file

    def _convert_token_to_id(self, token: str):
        return self.encoder.get(token, self.encoder[self.unk_token])

    def _convert_id_to_token(self, id: int):
        return self.decoder.get(id, self.unk_token)

    def prepare_preprocess(self):
        # Accented/unaccented lookup strings (kept from the original
        # preprocessing snippet; not used by the methods below).
        self.uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
        self.unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"
        # Map decomposed (base letter + combining mark) Vietnamese characters
        # to their precomposed Unicode forms. The decomposed keys are rebuilt
        # with NFD normalization, since decomposed literals do not survive
        # copy/paste intact.
        charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split("|")
        self.dict_char = {unicodedata.normalize("NFD", c): c for c in charutf8}
        # Vowel table: each row is a base vowel, its five tone-marked
        # variants, and the Telex-style ASCII spelling of the base vowel.
        self.bang_nguyen_am = [
            ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
            ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
            ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
            ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
            ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
            ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
            ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
            ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
            ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
            ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
            ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
            ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y'],
        ]
        self.bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']  # Telex tone keys
        # (i, j) = (vowel row, tone index) for every tone-marked vowel.
        self.nguyen_am_to_ids = {}
        for i in range(len(self.bang_nguyen_am)):
            for j in range(len(self.bang_nguyen_am[i]) - 1):
                self.nguyen_am_to_ids[self.bang_nguyen_am[i][j]] = (i, j)
        # Regex substitutions that turn emoticons/symbols into word tokens.
        self.sp_word_sub = {
            "@@": "confuseeyes",
            "℅": "%",
            r"/": " fraction ",
            r":\)+": "smileface",
            r";\)+": "smileface",
            r":\*+": "kissingface",
            r"=\)+": "playfulsmileface",
            r"=\(+": "playfulsadface",
            r":\(+": "sadface",
            r":3+": "threeface",
            r":v+": "vface",
            r"\^\^": "kindsmile",
            r"\^_\^": "kindmountsmile",
            r"\^\.\^": "kindmountsmile",
            r"-_-": "disapointface",
            r"\._\.": "confusedface",
            r":>+": "cutesmile",
            r"(\|)w(\|)": "fancycryface",
            r":\|": "mutedface",
            r":d+": "laughface",
            r"<3": "loveicon",
            r"\.{2,}": "threedot",
            r"-{1,}>{1,}": "arrow",
            r"={1,}>{1,}": "arrow",
            r"(\d+)h": r"\1 giờ",
            r"(\d+)'": r"\1 phút",
            r"(\d+)trieu": r"\1 triệu",
            r"(\d+)\s?tr": r"\1 triệu",
            r"blut\w+": "bluetooth",
            r"(\d+)\s\*": r"\1 sao",
        }
        # Whole-word substitutions that expand slang, abbreviations, and
        # common misspellings found in Vietnamese smartphone reviews.
        self.replace_dict = {
            "/": "fraction",
            "wf": "wifi",
            "wifj": "wifi",
            "wjfj": "wifi",
            "wjfi": "wifi",
            "wiffi": "wifi",
            "wj": "wifi",
            "ko": "không",
            "k": "không",
            "hong": "không",
            "đc": "được",
            "sp": "sản phẩm",
            "fb": "facebook",
            "ytb": "youtube",
            "yt": "youtube",
            "mes": "messenger",
            "mess": "messenger",
            "tgdđ": "thegioididong",
            "nv": "nhân viên",
            "ss": "samsung",
            "ip": "iphone",
            "appel": "apple",
            "oke": "ok",
            "okie": "ok",
            "okey": "ok",
            "oki": "ok",
            "oce": "ok",
            "okela": "ok",
            "mk": "mình",
            "sd": "sử dụng",
            "sdung": "sử dụng",
            "ae": "anh em",
            "lq": "liên quân",
            "lqmb": "liên quân mobile",
            "lun": "luôn",
            "ng": "người",
            "ad": "admin",
            "ms": "mới",
            "cx": "cũng",
            "cũg": "cũng",
            "nhìu": "nhiều",
            "bth": "bình thường",
            "bthg": "bình thường",
            "ngta": "người ta",
            "dow": "download",
            "hdh": "hệ điều hành",
            "hđh": "hệ điều hành",
            "cammera": "camera",
            "dt": "điện thoại",
            "dthoai": "điện thoại",
            "dth": "điện thoại",
            "đth": "điện thoại",
            "hk": "không",
            "j": "gì",
            "ji": "gì",
            "mn": "mọi người",
            "m.n": "mọi người",
            "mjh": "mình",
            "mjk": "mình",
            "lắc": "lag",
            "lác": "lag",
            "lang": "lag",
            "nhah": "nhanh",
            "nóichung": "nói chung",
            "zl": "zalo",
            "sóg": "sóng",
            "rẽ": "rẻ",
            "trc": "trước",
            "chíp": "chip",
            "bin": "pin",
            "lm": "làm",
            "bik": "biết",
            "hog": "không",
            "zỏm": "dổm",
            "z": "vậy",
            "v": "vậy",
            "r": "rồi",
            "ỗn": "ổn",
            "wá": "quá",
            "wep": "web",
            "wed": "web",
            "fim": "phim",
            "film": "phim",
            "xạc": "sạc",
            "xài": "sài",
            "het": "hết",
            "e": "em",
            "a": "anh",
            "bjo": "bây giờ",
            "vl": "vãi lồn",
            "sac": "sạc",
            "vidieo": "video",
            "tét": "test",
            "tes": "test",
            "thik": "thích",
            "fai": "phải",
            "✋": "tay",
            "🔋": "pin",
            "☆": "sao",
            "supper": "super",
            "lổi": "lỗi",
            "loát": "load",
            "thui": "thôi",
            "rùi": "rồi",
            "suống": "xuống",
"selfie", "gg": "google", "cam on": "cảm ơn", "tg": "thời gian", "nchung": "nói chung", "❤": "loveicon", "trại nghiệm": "trải nghiệm", "dất": "rất", "đứg": "đứng", "bằg": "bằng", "mìh": "mình", "đag": "đang", "thoi": "thôi", "củng": "cũng", "đả": "đã", "màng": "màn", "ff": "free fire", "cod": "call of duty", "moi thứ": "mọi thứ", "moi thu": "mọi thứ", "moi thư": "mọi thứ", "moi người": "mọi người", "moi": "mới", "dk": "được", "đk": "được", "nhậy": "nhạy", "ak": "á", "ghe": "nghe", "bùn": "buồn", "bit": "biết", "bít": "biết", "bnhieu": "bao nhiêu", "dụg": "dụng", "tk": "tài khoản", "sąc": "sạc", "rât": "rât", "haz": "haiz", "sai làm": "sai lầm", "flim": "film", "xướt": "xước", "viềng": "viền" } def convert_unicode(self, text: str): return re.sub( r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ', lambda x: self.dict_char[x.group()], text ) def is_valid_vietnam_word(self, word): chars = list(word) nguyen_am_index = -1 for index, char in enumerate(chars): x, y = self.nguyen_am_to_ids.get(char, (-1, -1)) if x != -1: if nguyen_am_index == -1: nguyen_am_index = index else: if index - nguyen_am_index != 1: return False nguyen_am_index = index return True def chuan_hoa_dau_tu_tieng_viet(self, word): if not self.is_valid_vietnam_word(word): return word chars = list(word) dau_cau = 0 nguyen_am_index = [] qu_or_gi = False for index, char in enumerate(chars): x, y = self.nguyen_am_to_ids.get(char, (-1, -1)) if x == -1: continue elif x == 9: # check qu if index != 0 and chars[index - 1] == 'q': chars[index] = 'u' qu_or_gi = True elif x == 5: # check gi if index != 0 and chars[index - 1] == 'g': chars[index] = 'i' qu_or_gi = True if y != 0: dau_cau = y chars[index] = self.bang_nguyen_am[x][0] if not qu_or_gi or index != 1: nguyen_am_index.append(index) if len(nguyen_am_index) < 2: if qu_or_gi: if len(chars) == 2: x, y = self.nguyen_am_to_ids.get(chars[1]) chars[1] = self.bang_nguyen_am[x][dau_cau] else: x, y = self.nguyen_am_to_ids.get(chars[2], (-1, -1)) if x != -1: chars[2] = self.bang_nguyen_am[x][dau_cau] else: chars[1] = self.bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else self.bang_nguyen_am[9][dau_cau] return ''.join(chars) return word for index in nguyen_am_index: x, y = self.nguyen_am_to_ids[chars[index]] if x == 4 or x == 8: # ê, ơ chars[index] = self.bang_nguyen_am[x][dau_cau] # for index2 in nguyen_am_index: # if index2 != index: # x, y = nguyen_am_to_ids[chars[index]] # chars[index2] = bang_nguyen_am[x][0] return ''.join(chars) if len(nguyen_am_index) == 2: if nguyen_am_index[-1] == len(chars) - 1: x, y = self.nguyen_am_to_ids[chars[nguyen_am_index[0]]] chars[nguyen_am_index[0]] = self.bang_nguyen_am[x][dau_cau] # x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]] # chars[nguyen_am_index[1]] = bang_nguyen_am[x][0] else: # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]] # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0] x, y = self.nguyen_am_to_ids[chars[nguyen_am_index[1]]] chars[nguyen_am_index[1]] = self.bang_nguyen_am[x][dau_cau] else: # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]] # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0] x, y = self.nguyen_am_to_ids[chars[nguyen_am_index[1]]] chars[nguyen_am_index[1]] = self.bang_nguyen_am[x][dau_cau] # x, y = nguyen_am_to_ids[chars[nguyen_am_index[2]]] # chars[nguyen_am_index[2]] = bang_nguyen_am[x][0] return ''.join(chars) def 
    def chuan_hoa_dau_cau_tieng_viet(self, sentence):
        """Normalize a Vietnamese sentence to the old-style tone-mark
        placement convention."""
        words = sentence.split()
        for index, word in enumerate(words):
            # Split each word into leading punctuation, letters, and trailing
            # punctuation, then normalize the letter part.
            cw = re.sub(r'(^\p{P}*)([\p{L}.]*\p{L}+)(\p{P}*$)', r'\1/\2/\3', word).split('/')
            if len(cw) == 3:
                cw[1] = self.chuan_hoa_dau_tu_tieng_viet(cw[1])
            words[index] = ''.join(cw)
        return ' '.join(words)

    def normalize(self, text: str, track_change=False):
        # Lowercase
        text = text.lower()
        # Replace URLs with a placeholder token
        text = re.sub(
            r"((https?|ftp|file):\/{2,3})+([-\w+&@#/%=~|$?!:,.]*)|(www\.)+([-\w+&@#/%=~|$?!:,.]*)",
            "urllink",
            text,
        )
        # Remove dup trailing chars (troiiiii -> troi); restricted to letters
        # so digit runs such as 9000 are left intact
        text = re.sub(r"(\p{L})\1+\b", r"\1", text)
        if track_change:
            print("Dedup trailing: ", text)
        # Replace special symbols with word tokens
        for pttn, repl in self.sp_word_sub.items():
            text = re.sub(fr"{pttn}", f" {repl} ", text)
        if track_change:
            print("Replace special word: ", text)

        # Correct misspelled words
        def replace(match):
            orig = match.group(1)
            return " " + self.replace_dict.get(orig, orig) + " "

        text = re.sub(r"\b(\S+)\b", replace, text)
        if track_change:
            print("Correct misspelled word: ", text)
        # Normalize string encoding
        text = self.convert_unicode(text)
        if track_change:
            print("Normalize string encoding: ", text)
        # Vietnamese tone-mark normalization
        text = self.chuan_hoa_dau_cau_tieng_viet(text)
        if track_change:
            print("Vietnamese unicode normalization: ", text)
        # Eliminate the thousands delimiter (9.000 -> 9000)
        text = re.sub(r"(?<=\d)\.(?=\d{3})", "", text)
        if track_change:
            print("Eliminate decimal delimiter: ", text)
        # Split between value and unit (300km -> 300 km)
        text = re.sub(r"(\d+)(\D+)", r"\1 \2", text)
        if track_change:
            print("Split between value and unit: ", text)
        # Split by punctuations
        text = " ".join(
            re.split("([" + re.escape("!\"#$%&'()*+,-./:;<=>?@[\\]^`{|}~") + "])", text)
        )
        if track_change:
            print("Split by punctuations: ", text)
        # Split by emoticons
        text = " ".join(
            re.split(
                "(["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u"\U00010000-\U0010ffff"
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                u"\ufe0f"
                u"\u221a"
                "])",
                text,
            )
        )
        # Word segmentation
        # text = " ".join(vncorenlp.word_segment(text))
        return text
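
# Usage sketch (not part of the original class): how the tokenizer above
# might be exercised end to end. The local file names and the sample review
# text are illustrative assumptions; vocab.txt and merge.txt are expected to
# come from the ptdat/vn-smartphone-absa repo listed in
# pretrained_vocab_files_map.
if __name__ == "__main__":
    tokenizer = VnSmartphoneAbsaTokenizer("vocab.txt", "merge.txt")
    # Trace each normalization stage on a noisy review sentence.
    print(tokenizer.normalize("pin trâu, xài đc 2 ngày lun :)))", track_change=True))
    # Standard Hugging Face call path: normalize -> char BPE -> input ids.
    encoded = tokenizer("màn hình đẹp nhưng loa hơi nhỏ")
    print(encoded["input_ids"])
    print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))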