File size: 509 Bytes
74b1bac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
import re
import string
from pyvi.ViTokenizer import tokenize
import bm25s
# Matches one markup tag, from "<" up to the first ">". The negated character
# class (unlike "." without re.DOTALL) also matches tags whose attributes wrap
# onto another line.
_TAG_RE = re.compile(r'<[^>]*>')

# Matches a run of whitespace; the group ends up holding the LAST character of
# the run, which is what the run gets replaced with.
_WHITESPACE_RUN_RE = re.compile(r'(\s)+')


def clean_text(text: str) -> str:
    """Strip markup tags and collapse whitespace runs in *text*.

    Steps, in order:

    1. Remove every ``<...>`` tag, including tags that span several lines.
    2. Strip leading and trailing whitespace.
    3. Replace each run of whitespace with the last whitespace character
       of that run (so ``"a \\n\\nb"`` becomes ``"a\\nb"``).

    Args:
        text: Raw input string, possibly containing HTML-like tags.

    Returns:
        The cleaned string.
    """
    without_tags = _TAG_RE.sub('', text).strip()
    return _WHITESPACE_RUN_RE.sub(r'\1', without_tags)
# Translation table mapping every ASCII punctuation character except "_" to a
# single space. The underscore is kept on purpose: it is excluded from the
# characters that get blanked out.
_PUNCT_TO_SPACE = str.maketrans(
    {ch: ' ' for ch in string.punctuation if ch != '_'}
)


def normalize_text(text: str) -> str:
    """Replace punctuation with spaces, lowercase, and trim *text*.

    Every character of ``string.punctuation`` other than ``_`` is replaced
    by one space in a single pass; the result is then lowercased and
    stripped of leading/trailing whitespace. Consecutive spaces produced by
    the replacement are not collapsed.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    return text.translate(_PUNCT_TO_SPACE).lower().strip()
def process_text(text):
    """Run the full preprocessing pipeline on *text*.

    The input is passed through ``clean_text``, then ``tokenize``, then
    ``normalize_text``, and the final result is returned.
    """
    # Innermost call runs first: clean -> tokenize -> normalize.
    return normalize_text(tokenize(clean_text(text)))