File size: 582 Bytes
c37e08c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
import re , string
from textacy.preprocessing.resources import (RE_EMAIL , RE_URL , RE_NUMBER ,
RE_NUMBER , RE_EMOJI , RE_SHORT_URL , RE_PHONE_NUMBER
)
# Matches any single ASCII letter (a-z, A-Z). Despite the name, only ASCII
# letters are covered; digits, punctuation and other scripts do not match.
NON_ARABIC_RE = re.compile(
    r"[%s]" % (string.ascii_letters,)
)
def clean_text(text: str) -> str:
    """Remove unwanted data from ``text``.

    Every match of each pattern below is deleted (replaced with the empty
    string), one pattern after another, and the resulting string is returned.

    Args:
        text: The raw input string.

    Returns:
        ``text`` with all matches of the e-mail, emoji, phone-number,
        short-URL, URL, number and ASCII-letter patterns removed.
    """
    # Order matters: the generic number pattern must run AFTER the phone and
    # URL patterns. Stripping digits first would leave only separator residue
    # from phone numbers (so the phone pattern could never match) and would
    # mangle URLs that contain digits before the URL patterns see them.
    # The ASCII-letter pattern stays last so it cannot break up e-mails/URLs.
    patterns = (
        RE_EMAIL,
        RE_EMOJI,
        RE_PHONE_NUMBER,
        RE_SHORT_URL,
        RE_URL,
        RE_NUMBER,
        NON_ARABIC_RE,
    )
    for pattern in patterns:
        text = pattern.sub("", text)
    return text
|