"""Text-cleaning helpers: emoticon/emoji removal, Unicode and tone-mark
normalization, and whitespace/punctuation squeezing."""

import re
import unicodedata
from string import punctuation

# One "eyes" character followed by one or more "mouth" characters.
# NOTE(review): there is no word-boundary check, so sequences such as "-v",
# "->" or "=>" inside ordinary text are removed too.
_EMOTICON_PATTERN = re.compile(r"(:|;|=|-|@)(\)|]|\(|v|>|<|D|@)+")

_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # NOTE: very wide; also spans CJK, kana, Hangul
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+"
)

# A run of two or more copies of the same ASCII punctuation character.
_REPEATED_PUNCTUATION_PATTERN = re.compile(
    r"([%s])\1+" % re.escape(punctuation)
)

# Lowercase vowel pairs whose tone mark sits on the first vowel, mapped to
# the spelling with the mark on the second vowel. Upper- and mixed-case
# variants are derived at substitution time (see _move_tone_mark).
_ACCENT_MAP = {
    "òa": "oà",
    "óa": "oá",
    "ỏa": "oả",
    "õa": "oã",
    "ọa": "oạ",
    "òe": "oè",
    "óe": "oé",
    "ỏe": "oẻ",
    "õe": "oẽ",
    "ọe": "oẹ",
    "ùy": "uỳ",
    "úy": "uý",
    "ủy": "uỷ",
    "ũy": "uỹ",
    "ụy": "uỵ",
}

_ACCENT_PATTERN = re.compile(
    "|".join(re.escape(pair) for pair in _ACCENT_MAP),
    re.IGNORECASE,
)


def _move_tone_mark(match):
    """Return the replacement for one matched pair, keeping per-letter case."""
    source = match.group(0)
    target = _ACCENT_MAP[source.lower()]
    return "".join(
        new.upper() if old.isupper() else new
        for old, new in zip(source, target)
    )


def remove_emoticon(text: str) -> str:
    """Return *text* with ASCII emoticons such as ``:)``, ``;D`` or ``=((`` removed."""
    return _EMOTICON_PATTERN.sub("", text)


def remove_emoji(text: str) -> str:
    """Return *text* with every character in the emoji character class removed.

    The class is deliberately broad: besides emoji it also covers CJK,
    kana, Hangul and all supplementary-plane characters.
    """
    return _EMOJI_PATTERN.sub("", text)


def remove_consecutive_whitespace(text: str) -> str:
    """Collapse every whitespace run to one space and strip both ends."""
    return " ".join(text.split())


def remove_consecutive_punctuation(text: str) -> str:
    """Collapse runs of one repeated ASCII punctuation mark to a single mark.

    For example ``"hi!!!"`` becomes ``"hi!"``; different adjacent marks
    (``"?!"``) are left untouched.
    """
    return _REPEATED_PUNCTUATION_PATTERN.sub(r"\1", text)


def normalize_unicode(text: str) -> str:
    """Return *text* in Unicode NFKC normal form."""
    return unicodedata.normalize("NFKC", text)


def normalize_accents(text: str) -> str:
    """Move Vietnamese tone marks in ``oa``/``oe``/``uy`` onto the second vowel.

    Matching is case-insensitive and the case of each letter is preserved,
    so ``"hòa"`` becomes ``"hoà"``, ``"Hòa"`` becomes ``"Hoà"`` and
    ``"HÒA"`` becomes ``"HOÀ"``. Only precomposed characters are matched;
    apply :func:`normalize_unicode` first if the input may be decomposed.
    """
    return _ACCENT_PATTERN.sub(_move_tone_mark, text)


def preprocess_pipeline(text: str) -> str:
    """Run every cleaning step on *text* and return the result.

    Order matters: emoticons are removed before punctuation is squeezed
    (otherwise ``:))`` would be altered first), and Unicode is normalized
    before tone marks are moved so that precomposed characters are matched.
    """
    text = remove_emoticon(text)
    text = remove_emoji(text)
    text = normalize_unicode(text)
    text = normalize_accents(text)
    text = remove_consecutive_whitespace(text)
    text = remove_consecutive_punctuation(text)
    return text