Update utils.py
Browse files
utils.py
CHANGED
@@ -1,8 +1,5 @@
|
|
1 |
-
import jdk
|
2 |
-
jdk.install('11', jre=True)
|
3 |
from imports import *
|
4 |
import unicodedata
|
5 |
-
rdrsegmenter = VnCoreNLP("./vncorenlp_segmenter/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
|
6 |
dict_map = {
|
7 |
"òa": "oà",
|
8 |
"Òa": "Oà",
|
@@ -59,7 +56,8 @@ def replace_all(text, dict_map=dict_map):
|
|
59 |
def normalize(text, segment=True):
|
60 |
text = replace_all(text, dict_map)
|
61 |
if segment:
|
62 |
-
text =
|
|
|
63 |
return text
|
64 |
def text_preprocess(document):
|
65 |
punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
|
@@ -85,7 +83,8 @@ def text_preprocess(document):
|
|
85 |
document = re.sub(" ", " ", document)
|
86 |
document = re.sub(" ", " ", document)
|
87 |
try:
|
88 |
-
document =
|
|
|
89 |
except:
|
90 |
pass
|
91 |
return document.lower()
|
|
|
|
|
|
|
1 |
from imports import *
|
2 |
import unicodedata
|
|
|
3 |
dict_map = {
|
4 |
"òa": "oà",
|
5 |
"Òa": "Oà",
|
|
|
56 |
def normalize(text, segment=True):
    """Normalize Vietnamese text.

    Applies the character-level replacements in ``dict_map`` (via
    ``replace_all``), then optionally word-segments the text with
    underthesea, sentence by sentence.

    Args:
        text: Input string to normalize.
        segment: When True, split on "." and word-tokenize each piece
            with ``underthesea.word_tokenize(..., format="text")``.

    Returns:
        The normalized (and optionally word-segmented) string.
    """
    text = replace_all(text, dict_map)
    if segment:
        # Tokenize sentence-by-sentence, then stitch the pieces back
        # together with ". ".
        # NOTE(review): splitting on "." discards the original spacing
        # around periods — confirm this matches the intended output.
        sentences = text.split(".")
        # Fix: the original line had mismatched brackets
        # ("... for i in text)])"), which is a SyntaxError.
        text = ". ".join(
            [underthesea.word_tokenize(s, format="text") for s in sentences]
        )
    return text
|
62 |
def text_preprocess(document):
|
63 |
punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
|
|
|
83 |
document = re.sub(" ", " ", document)
|
84 |
document = re.sub(" ", " ", document)
|
85 |
try:
|
86 |
+
document = document.split(".")
|
87 |
+
document = ". ".join([underthesea.word_tokenize(i, format="text") for i in document])
|
88 |
except:
|
89 |
pass
|
90 |
return document.lower()
|