import re | |
import string | |
from pyvi.ViTokenizer import tokenize | |
import bm25s | |
# A tag is "<", any run of characters that are not ">", then ">". Using a
# negated class instead of ".*?" lets a tag span several lines (attributes
# wrapped onto a new line), which "." would not match without re.DOTALL.
_TAG_RE = re.compile(r'<[^>]*>')
# One or more whitespace characters; the group ends up holding the last one.
_WHITESPACE_RUN_RE = re.compile(r'(\s)+')


def clean_text(text):
    """Strip markup tags from *text* and collapse runs of whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The string with every ``<...>`` span removed (including spans that
        contain newlines), leading/trailing whitespace stripped, and each
        run of consecutive whitespace characters reduced to a single
        character -- the last character of that run.
    """
    text = _TAG_RE.sub('', text).strip()
    # A repeated capture group keeps only its final iteration, so "\1"
    # substitutes the last whitespace character of each run.
    return _WHITESPACE_RUN_RE.sub(r'\1', text)
def normalize_text(text):
    """Replace ASCII punctuation with spaces, then lowercase and strip.

    Every character of ``string.punctuation`` except the underscore is
    turned into a single space; underscores are left untouched.
    NOTE(review): the underscore is presumably kept because the tokenizer
    used upstream joins multi-syllable words with "_" -- confirm.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with punctuation replaced by spaces and
        leading/trailing whitespace removed.
    """
    separators = ''.join(ch for ch in string.punctuation if ch != '_')
    # Map each punctuation character to a space in one pass.
    to_space = str.maketrans(separators, ' ' * len(separators))
    return text.translate(to_space).lower().strip()
def process_text(text):
    """Run the full preprocessing pipeline on *text*.

    The input is passed through ``clean_text``, then through pyvi's
    ``tokenize``, and finally through ``normalize_text``, in that order.

    Args:
        text: The raw string to preprocess.

    Returns:
        The value returned by ``normalize_text`` for the tokenized,
        cleaned input.
    """
    cleaned = clean_text(text)
    segmented = tokenize(cleaned)
    return normalize_text(segmented)