|
import re |
|
import unicodedata |
|
from string import punctuation |
|
|
|
|
|
# An emoticon is one "eyes" character (: ; = - @) followed by one or more
# "mouth" characters.  Symbol mouths ( ) ] ( > < @ ) always count.  The letter
# mouths "v" and "D" only count when no word character follows them;
# otherwise ordinary text such as "pre-view" or "user@Domain.com" would lose
# characters.
_EMOTICON_PATTERN = re.compile(r"[:;=\-@](?:[)\](><@]|[vD](?!\w))+")


def remove_emoticon(text: str) -> str:
    """Strip ASCII emoticons such as ``:)``, ``;D``, ``=))`` or ``:v`` from *text*.

    Args:
        text: The string to clean.

    Returns:
        *text* with every emoticon match deleted.  Surrounding whitespace is
        left as is, so removing an emoticon between two spaces leaves both
        spaces behind.

    Note:
        The letter mouths ``v`` and ``D`` are only treated as part of an
        emoticon when they are not immediately followed by a word character.
        Symbol-only sequences (e.g. ``->``) are still removed wherever they
        occur.
    """
    return _EMOTICON_PATTERN.sub("", text)
|
|
|
|
|
# Characters treated as emoji (or emoji glue) and deleted by remove_emoji().
# The class deliberately lists single BMP code points instead of one wide
# range: a range such as U+24C2..U+1F251 would also swallow CJK ideographs,
# kana, Hangul and fullwidth forms, which are regular text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # every supplementary-plane character (emoticons, pictographs, flags, ...)
    "\u2500-\u2BEF"  # Box Drawing .. Miscellaneous Symbols and Arrows (incl. U+2600 symbols, U+2700 dingbats)
    "\u24C2"  # circled latin capital letter M
    "\u200D"  # zero width joiner used inside emoji sequences
    "\u231A"  # watch
    "\u23CF"  # eject symbol
    "\u23E9"  # fast-forward symbol
    "\u3030"  # wavy dash
    "\u303D"  # part alternation mark
    "\u3297"  # circled ideograph congratulation
    "\u3299"  # circled ideograph secret
    "\uFE0F"  # variation selector-16 (emoji presentation)
    "]+"
)


def remove_emoji(text: str) -> str:
    """Delete emoji and pictographic symbols from *text*.

    Args:
        text: The string to clean.

    Returns:
        *text* with every run of characters matched by ``_EMOJI_PATTERN``
        removed.  Letters of any script in the Basic Multilingual Plane
        (Latin with diacritics, CJK, kana, fullwidth forms, ...) are kept.

    Note:
        All supplementary-plane characters (U+10000 and above) are removed,
        not only emoji.
    """
    return _EMOJI_PATTERN.sub("", text)
|
|
|
|
|
def remove_consecutive_whitespace(text: str):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` without arguments discards empty fields at both ends.
    """
    words = text.split()
    single_space = " "
    return single_space.join(words)
|
|
|
|
|
def remove_consecutive_punctuation(text: str):
    """Reduce each run of one repeated punctuation mark to a single mark.

    Only repeats of the *same* ASCII punctuation character (as listed in
    ``string.punctuation``) are collapsed: ``"!!!"`` becomes ``"!"`` while
    ``"!?"`` is left unchanged.
    """
    # Escape the punctuation so it can sit safely inside a character class.
    punctuation_class = re.escape(punctuation)
    # Group 1 captures one mark; the backreference demands at least one repeat.
    repeated_mark = re.compile("([" + punctuation_class + r"])\1+")
    return repeated_mark.sub(lambda match: match.group(1), text)
|
|
|
|
|
def normalize_unicode(text: str):
    """Return *text* converted to Unicode normalization form NFKC.

    NFKC applies compatibility decomposition followed by canonical
    composition, so e.g. fullwidth letters and ligatures are folded into
    their plain equivalents and base letter + combining mark pairs are
    merged into precomposed characters.
    """
    normalized = unicodedata.normalize("NFKC", text)
    return normalized
|
|
|
|
|
# Vowel pairs whose tone mark sits on the first vowel, mapped to the spelling
# with the tone mark on the second vowel.  Lower-case, title-case and
# upper-case spellings are listed explicitly so the replacement keeps the
# case of the input.
_TONE_MARK_FIXES = {
    "òa": "oà", "Òa": "Oà", "ÒA": "OÀ",
    "óa": "oá", "Óa": "Oá", "ÓA": "OÁ",
    "ỏa": "oả", "Ỏa": "Oả", "ỎA": "OẢ",
    "õa": "oã", "Õa": "Oã", "ÕA": "OÃ",
    "ọa": "oạ", "Ọa": "Oạ", "ỌA": "OẠ",
    "òe": "oè", "Òe": "Oè", "ÒE": "OÈ",
    "óe": "oé", "Óe": "Oé", "ÓE": "OÉ",
    "ỏe": "oẻ", "Ỏe": "Oẻ", "ỎE": "OẺ",
    "õe": "oẽ", "Õe": "Oẽ", "ÕE": "OẼ",
    "ọe": "oẹ", "Ọe": "Oẹ", "ỌE": "OẸ",
    "ùy": "uỳ", "Ùy": "Uỳ", "ÙY": "UỲ",
    "úy": "uý", "Úy": "Uý", "ÚY": "UÝ",
    "ủy": "uỷ", "Ủy": "Uỷ", "ỦY": "UỶ",
    "ũy": "uỹ", "Ũy": "Uỹ", "ŨY": "UỸ",
    "ụy": "uỵ", "Ụy": "Uỵ", "ỤY": "UỴ",
}

# One alternation over all keys, compiled once.  Matching is case-sensitive
# on purpose: a case-insensitive match would let the lower-case rule fire on
# upper-case text and rewrite it in lower case.
_TONE_MARK_PATTERN = re.compile("|".join(re.escape(pair) for pair in _TONE_MARK_FIXES))


def normalize_accents(text: str) -> str:
    """Move the tone mark in ``oa``/``oe``/``uy`` pairs onto the second vowel.

    Every occurrence of a key of ``_TONE_MARK_FIXES`` is replaced by its
    value, e.g. ``"hòa"`` -> ``"hoà"`` and ``"THỦY"`` -> ``"THUỶ"``.

    Args:
        text: The string to normalize.  The table is written with
            precomposed characters, so text that stores tone marks as
            separate combining characters is not matched;
            ``preprocess_pipeline`` applies NFKC normalization before
            calling this function.

    Returns:
        *text* with all listed pairs rewritten; the letter case of each pair
        is preserved.  Mixed-case pairs that are not in the table (lower-case
        first vowel followed by an upper-case second vowel) are left
        unchanged.
    """
    # A single pass is enough: no replacement value can combine with its
    # neighbours to form another key, because keys start with a toned o/u
    # while values start with a plain one.
    return _TONE_MARK_PATTERN.sub(lambda match: _TONE_MARK_FIXES[match.group(0)], text)
|
|
|
|
|
def preprocess_pipeline(text):
    """Run the full text-cleaning pipeline on *text* and return the result.

    The steps are applied in this fixed order, each one receiving the output
    of the previous step:

    1. ``remove_emoticon``
    2. ``remove_emoji``
    3. ``normalize_unicode``
    4. ``normalize_accents``
    5. ``remove_consecutive_whitespace``
    6. ``remove_consecutive_punctuation``
    """
    steps = (
        remove_emoticon,
        remove_emoji,
        normalize_unicode,
        normalize_accents,
        remove_consecutive_whitespace,
        remove_consecutive_punctuation,
    )
    for step in steps:
        text = step(text)
    return text
|
|