Spaces:
Sleeping
Sleeping
import re | |
import unicodedata | |
from string import punctuation | |
def remove_emoticon(text: str):
    """Strip ASCII emoticons such as ``:)``, ``;(``, ``=D`` or ``:v`` from *text*.

    An emoticon is one "eyes" character (``:``, ``;``, ``=``, ``-`` or ``@``)
    followed by one or more "mouth" characters (``)``, ``]``, ``(``, ``v``,
    ``>``, ``<``, ``D`` or ``@``).

    NOTE: the pattern is not anchored to word boundaries, so it also fires
    inside ordinary tokens (for example ``-v`` in ``e-visa`` or ``->``).

    Returns the text with every match deleted; surrounding whitespace is
    left as-is.
    """
    return re.sub(r"(:|;|=|-|@)(\)|]|\(|v|>|<|D|@)+", "", text)
def remove_emoji(text: str):
    """Delete emoji and other pictographic code points from *text*.

    The character class below is deliberately kept exactly as it was, but be
    aware that it is much wider than "emoji":

    * ``U+24C2-U+1F251`` spans CJK ideographs, kana, Hangul, full-width
      forms and more, so text in those scripts is removed as well.
    * ``U+10000-U+10FFFF`` removes every supplementary-plane character.

    Returns the text with each run of matching characters removed (nothing
    is inserted in their place).
    """
    members = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",  # very wide: enclosed alphanumerics .. CJK ..
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",  # gender signs
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",  # wavy dash
    )
    matcher = re.compile("[" + "".join(members) + "]+", re.UNICODE)
    return matcher.sub("", text)
def remove_consecutive_whitespace(text: str):
    """Collapse every run of whitespace in *text* into a single space.

    Because the text is split on whitespace and re-joined, leading and
    trailing whitespace is dropped and tabs/newlines become plain spaces.
    """
    words = text.split()
    return " ".join(words)
def remove_consecutive_punctuation(text: str):
    """Squeeze repeats of the same ASCII punctuation mark down to one.

    ``"wow!!!"`` becomes ``"wow!"`` and ``"..."`` becomes ``"."``. Only
    immediate repeats of the *same* character are collapsed, so a mixed
    sequence such as ``"?!"`` is left untouched. The set of marks is
    ``string.punctuation``.
    """
    # Capture one punctuation character, then swallow its direct repeats.
    repeated_mark = rf"([{re.escape(punctuation)}])\1+"
    return re.sub(repeated_mark, r"\1", text)
def normalize_unicode(text: str):
    """Return *text* in Unicode normalization form NFKC.

    NFKC applies compatibility decomposition followed by canonical
    composition, so e.g. full-width letters and ligatures are folded to
    their plain equivalents and base+combining-mark pairs are composed.
    """
    form = "NFKC"
    return unicodedata.normalize(form, text)
# Lower-case "old style" vowel pair -> "new style" pair, i.e. the tone mark is
# moved from the first vowel to the second one (hòa -> hoà, thủy -> thuỷ).
# Upper-case and mixed-case variants are derived at replacement time.
_TONE_SHIFT_MAP = {
    "òa": "oà",
    "óa": "oá",
    "ỏa": "oả",
    "õa": "oã",
    "ọa": "oạ",
    "òe": "oè",
    "óe": "oé",
    "ỏe": "oẻ",
    "õe": "oẽ",
    "ọe": "oẹ",
    "ùy": "uỳ",
    "úy": "uý",
    "ủy": "uỷ",
    "ũy": "uỹ",
    "ụy": "uỵ",
}

# One alternation over all pairs so the text is scanned a single time.
_TONE_SHIFT_PATTERN = re.compile(
    "|".join(re.escape(pair) for pair in _TONE_SHIFT_MAP),
    flags=re.IGNORECASE,
)


def _shift_tone(match):
    """Return the tone-shifted pair for *match*, keeping each letter's case."""
    found = match.group(0)
    target = _TONE_SHIFT_MAP.get(found.lower())
    if target is None:
        # Defensive: an unexpected case-insensitive match is left unchanged.
        return found
    return "".join(
        new.upper() if old.isupper() else new
        for old, new in zip(found, target)
    )


def normalize_accents(text: str) -> str:
    """Normalize Vietnamese tone-mark placement on ``oa``, ``oe`` and ``uy``.

    Pairs written with the tone mark on the first vowel (``òa``, ``óe``,
    ``ủy`` ...) are rewritten with the mark on the second vowel (``oà``,
    ``oé``, ``uỷ`` ...).

    Matching is case-insensitive, and the case of each letter is preserved
    position by position: ``"hòa"`` -> ``"hoà"``, ``"Òa"`` -> ``"Oà"``,
    ``"HÒA"`` -> ``"HOÀ"``. (Previously every variant was replaced by the
    lower-case form because each lower-case key was applied with
    ``re.IGNORECASE`` before the cased keys could be reached.)

    The keys are precomposed characters, so *text* is expected to be in a
    composed normalization form (NFC/NFKC); decomposed input is not matched.

    Args:
        text: Input string.

    Returns:
        The string with tone marks repositioned; other characters are
        unchanged.
    """
    return _TONE_SHIFT_PATTERN.sub(_shift_tone, text)
def preprocess_pipeline(text):
    """Run the full text clean-up chain on *text* and return the result.

    The stages are applied strictly in this order; each receives the output
    of the previous one:

    1. remove ASCII emoticons
    2. remove emoji
    3. Unicode NFKC normalization
    4. Vietnamese tone-mark normalization
    5. collapse whitespace
    6. collapse repeated punctuation
    """
    stages = (
        remove_emoticon,
        remove_emoji,
        normalize_unicode,
        normalize_accents,
        remove_consecutive_whitespace,
        remove_consecutive_punctuation,
    )
    for stage in stages:
        text = stage(text)
    return text