Text-Summarizer / src /utils.py
Gladiator's picture
add clean text func
85ebc15
raw
history blame
916 Bytes
import re
# Code-point ranges (written as regex character-class ranges) that make up
# the emoji matcher below.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    "\U00002702-\U000027B0",
    # NOTE(review): this last range runs from U+24C2 all the way to U+1F251,
    # so it also matches non-emoji characters in between (e.g. CJK
    # ideographs) -- confirm that is acceptable for callers.
    "\U000024C2-\U0001F251",
)

# Matches one or more consecutive characters from the ranges above.
emoji_pattern = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)
def clean_text(x):
    """Normalize raw text to lowercase ASCII words separated by single spaces.

    Steps, in order:

    1. Lowercase the text.
    2. Drop every non-ASCII character (this is also what removes emojis).
    3. Remove URLs, ``@mentions`` and ``#hashtags``.
    4. Delete apostrophes so contractions stay one token
       (``"don't"`` -> ``"dont"``).
    5. Replace each remaining run of non-alphanumeric characters
       (punctuation and whitespace alike) with one space.
    6. Trim leading and trailing spaces.

    Digits are kept.

    Args:
        x: The string to clean.

    Returns:
        The cleaned string, containing only ``a-z``, ``0-9`` and single
        spaces between tokens.
    """
    text = x.lower()

    # Non-ASCII characters are discarded rather than transliterated, so
    # accented letters disappear ("café" -> "caf") and emojis are removed
    # here; no separate emoji pass is needed afterwards.
    text = text.encode("ascii", "ignore").decode()

    # Require the "://" separator so ordinary words that merely start with
    # "http" (e.g. "httpd") are not mistaken for URLs and deleted.
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"@\S+", " ", text)  # mentions
    text = re.sub(r"#\S+", " ", text)  # hashtags

    # Remove apostrophes outright (no space) so "it's" becomes "its"
    # instead of being split into "it s" by the next step.
    text = text.replace("'", "")

    # Collapses punctuation and any run of whitespace into a single space,
    # which makes a dedicated whitespace-collapsing pass unnecessary.
    text = re.sub("[^A-Za-z0-9]+", " ", text)

    # The substitution above turns edge punctuation into a space; trim it.
    return text.strip()