Update utils.py
utils.py
@@ -57,7 +57,7 @@ def normalize(text, segment=True):
     text = replace_all(text, dict_map)
     if segment:
         text = text.split(".")
-        text = ". ".join([underthesea.word_tokenize(i, format="text") for i in text
+        text = ". ".join([underthesea.word_tokenize(i, format="text") for i in text])
     return text
 def text_preprocess(document):
     punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
@@ -84,7 +84,7 @@ def text_preprocess(document):
     document = re.sub(" ", " ", document)
     try:
         document = document.split(".")
-        document = ". ".join([underthesea.word_tokenize(i, format="text") for i in document
+        document = ". ".join([underthesea.word_tokenize(i, format="text") for i in document])
     except:
         pass
     return document.lower()
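For context, the bug fixed in both hunks is the same: the list comprehension was missing its closing "])", so utils.py failed to import with a SyntaxError. Below is a minimal, self-contained sketch of the corrected segmentation step, assuming the underthesea package is installed; the replace_all/dict_map character-mapping step from the full utils.py is omitted because it is unrelated to this fix.

# Minimal sketch of the corrected segmentation step (not the full utils.py).
# Assumes underthesea is installed: pip install underthesea
import underthesea

def normalize(text, segment=True):
    if segment:
        sentences = text.split(".")
        # The fix: the comprehension is now closed with "])", so each
        # sentence is word-tokenized and the pieces rejoined with ". ".
        text = ". ".join([underthesea.word_tokenize(s, format="text")
                          for s in sentences])
    return text

# With format="text", multi-syllable Vietnamese words come back joined
# with underscores, e.g. "sinh viên" -> "sinh_viên".
print(normalize("Tôi là sinh viên. Tôi học tại Hà Nội."))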