# vi-en-mt-teencode-slang / preprocessing.py
# nguyen1207's picture
# initial commit
# 6136947
import re
import unicodedata
from string import punctuation
# An emoticon is one "eyes" character (: ; = - @) followed by a run of "mouth"
# characters () ] ( v > < D @).  Because v and D are also ordinary letters, a
# mouth run may only END in v/D when no word character follows; otherwise the
# match stops at the last symbol in the run.  Without that guard, "-v" in
# "anti-virus" or "@v" in "a@vng.com" would be treated as an emoticon.
_EMOTICON_PATTERN = re.compile(
    r"[:;=\-@]"
    r"(?:"
    # Mouth run ending in a symbol, optionally extended by trailing v/D
    # letters when they are not the start of a word.
    r"[)\](v><D@]*[)\](><@](?:[vD]+(?!\w))?"
    r"|"
    # Mouth made only of v/D letters, not running into a word.
    r"[vD]+(?!\w)"
    r")"
)


def remove_emoticon(text: str) -> str:
    """Remove ASCII emoticons such as ``:)``, ``=))``, ``:v`` and ``:D``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every emoticon deleted.  Surrounding whitespace is left
        untouched, so removing an emoticon can leave two adjacent spaces.

    A letter mouth (``v`` or ``D``) is only treated as part of an emoticon
    when it is not immediately followed by a word character, so hyphenated
    words and e-mail addresses keep their letters.  Symbol mouths are removed
    even when glued to the next word (``"ok:)nhe"`` becomes ``"oknhe"``).
    """
    return _EMOTICON_PATTERN.sub("", text)
def remove_emoji(text: str):
    """Delete every run of characters that falls in the emoji character class.

    The class is assembled from the code-point ranges and single code points
    listed below and wrapped as ``[...]+`` so that a whole run is removed in
    one substitution.  Nothing is inserted in place of the removed run.

    NOTE(review): the ranges U+24C2-U+1F251 and U+10000-U+10FFFF overlap most
    of the other entries; together they cover every code point from U+24C2
    upward, so CJK, Hangul and fullwidth characters are stripped as well as
    emoji.  Confirm this is intended before narrowing the class.
    """
    class_members = (
        "\U0001F600-\U0001F64F",  # emoticon faces
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",  # circled M through circled ideograph (very wide)
        "\U0001f926-\U0001f937",  # face palm through shrug
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",  # female / earth / male signs
        "\u2600-\u2B55",  # misc symbols through heavy large circle
        "\u200d",  # zero width joiner
        "\u23cf",  # eject symbol
        "\u23e9",  # fast-forward symbol
        "\u231a",  # watch
        "\ufe0f",  # variation selector-16
        "\u3030",  # wavy dash
    )
    emoji_run = re.compile("[" + "".join(class_members) + "]+", re.UNICODE)
    return emoji_run.sub("", text)
def remove_consecutive_whitespace(text: str):
    """Collapse every run of whitespace in *text* to a single space.

    Splitting with no separator also discards leading and trailing
    whitespace, so the result never starts or ends with a space.
    """
    tokens = text.split()
    single_spaced = " ".join(tokens)
    return single_spaced
def remove_consecutive_punctuation(text: str):
    """Shrink each run of one repeated punctuation mark to a single mark.

    Only ASCII punctuation from ``string.punctuation`` is considered, and
    only when the *same* character repeats: ``"!!!"`` becomes ``"!"`` while a
    mixed run such as ``"?!"`` is left alone.
    """
    punct_class = "[" + re.escape(punctuation) + "]"
    # Group 1 captures the mark; the backreference requires exact repeats.
    repeated_mark = re.compile("(" + punct_class + r")\1+")
    return repeated_mark.sub(lambda run: run.group(1), text)
def normalize_unicode(text: str):
    """Return *text* converted to Unicode normalization form NFKC.

    NFKC applies compatibility decomposition followed by canonical
    composition, so compatibility variants (for example fullwidth letters or
    ligatures) become their plain equivalents and base-plus-combining
    sequences become precomposed characters where one exists.
    """
    nfkc_text = unicodedata.normalize("NFKC", text)
    return nfkc_text
# Lowercase vowel pairs written with the tone mark on the first vowel, mapped
# to the same pair with the tone mark moved onto the second vowel.
_TONE_SHIFTS = {
    "òa": "oà",
    "óa": "oá",
    "ỏa": "oả",
    "õa": "oã",
    "ọa": "oạ",
    "òe": "oè",
    "óe": "oé",
    "ỏe": "oẻ",
    "õe": "oẽ",
    "ọe": "oẹ",
    "ùy": "uỳ",
    "úy": "uý",
    "ủy": "uỷ",
    "ũy": "uỹ",
    "ụy": "uỵ",
}

# One case-insensitive alternation over all pairs, so the text is scanned once.
_TONE_SHIFT_PATTERN = re.compile(
    "|".join(map(re.escape, _TONE_SHIFTS)), re.IGNORECASE
)


def _shift_tone_keep_case(match):
    """Return the matched vowel pair with its tone mark moved, keeping case."""
    found = match.group(0)
    shifted = _TONE_SHIFTS.get(found.lower())
    if shifted is None:
        # Defensive: leave anything the table does not know untouched.
        return found
    # Each output letter takes the case of the input letter in that position.
    return "".join(
        new.upper() if old.isupper() else new
        for old, new in zip(found, shifted)
    )


def normalize_accents(text: str) -> str:
    """Move the tone mark in ``oa``, ``oe`` and ``uy`` onto the second vowel.

    For example ``"hòa"`` becomes ``"hoà"`` and ``"thủy"`` becomes ``"thuỷ"``.
    The case of each letter is preserved, so ``"HÒA"`` becomes ``"HOÀ"`` and
    ``"Ủy"`` becomes ``"Uỷ"``.

    Args:
        text: The string to normalize.  The table uses precomposed
            characters, so decomposed input should be composed first.

    Returns:
        ``text`` with every listed vowel pair rewritten; everything else is
        returned unchanged.
    """
    return _TONE_SHIFT_PATTERN.sub(_shift_tone_keep_case, text)
def preprocess_pipeline(text):
    """Run every cleaning step on *text* and return the cleaned string.

    The steps run in a fixed order, each receiving the previous step's
    output: emoticon removal, emoji removal, Unicode normalization, accent
    normalization, whitespace collapsing, and finally collapsing of repeated
    punctuation.
    """
    steps = (
        remove_emoticon,
        remove_emoji,
        normalize_unicode,
        normalize_accents,
        remove_consecutive_whitespace,
        remove_consecutive_punctuation,
    )
    for step in steps:
        text = step(text)
    return text