"""Python wrapper around VnCoreNLP-1.2 via pyjnius: a download helper and an annotation pipeline."""
import os
import shutil

import jnius_config

# Location of VnCoreNLP-1.2.jar and its models/ directory.
save_dir = "/home2/vietle/icgpt/vncorenlp-1.2"

# The JVM must be configured before `jnius` is imported anywhere:
# give it a 4 GB heap and put the VnCoreNLP jar on the classpath.
max_heap_size = '-Xmx4g'
jnius_config.add_options(max_heap_size)
jnius_config.set_classpath(save_dir + "/VnCoreNLP-1.2.jar")


def download_model(save_dir='./'):
    """Download VnCoreNLP-1.2.jar and its model files into `save_dir`."""
    if save_dir[-1] == '/':
        save_dir = save_dir[:-1]
    if os.path.isdir(save_dir + "/models") and os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
        print("VnCoreNLP model folder " + save_dir + " already exists! Please load VnCoreNLP from this folder!")
    else:
        # Create the expected model directory layout (including missing parents).
        os.makedirs(save_dir + "/models/dep", exist_ok=True)
        os.makedirs(save_dir + "/models/ner", exist_ok=True)
        os.makedirs(save_dir + "/models/postagger", exist_ok=True)
        os.makedirs(save_dir + "/models/wordsegmenter", exist_ok=True)

        # Main jar.
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.2.jar")
        shutil.move("VnCoreNLP-1.2.jar", save_dir + "/VnCoreNLP-1.2.jar")

        # Word segmenter models.
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab")
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr")
        shutil.move("vi-vocab", save_dir + "/models/wordsegmenter/vi-vocab")
        shutil.move("wordsegmenter.rdr", save_dir + "/models/wordsegmenter/wordsegmenter.rdr")

        # POS tagger model.
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/postagger/vi-tagger")
        shutil.move("vi-tagger", save_dir + "/models/postagger/vi-tagger")

        # NER models.
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-500brownclusters.xz")
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-ner.xz")
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-pretrainedembeddings.xz")
        shutil.move("vi-500brownclusters.xz", save_dir + "/models/ner/vi-500brownclusters.xz")
        shutil.move("vi-ner.xz", save_dir + "/models/ner/vi-ner.xz")
        shutil.move("vi-pretrainedembeddings.xz", save_dir + "/models/ner/vi-pretrainedembeddings.xz")

        # Dependency parser model.
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/dep/vi-dep.xz")
        shutil.move("vi-dep.xz", save_dir + "/models/dep/vi-dep.xz")


class VnCoreNLP:
    def __init__(self, annotators=None, save_dir='./'):
        # Avoid a mutable default argument; "wseg" is always required by the pipeline.
        if annotators is None:
            annotators = ["wseg", "pos", "ner", "parse"]
        if save_dir[-1] == '/':
            save_dir = save_dir[:-1]
        if not os.path.isdir(save_dir + "/models") or not os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
            raise Exception("Please download the VnCoreNLP model!")
        self.current_working_dir = os.getcwd()
        # VnCoreNLP resolves its model files relative to the working directory.
        os.chdir(save_dir)

        # Import jnius only after jnius_config has been set up at module load.
        from jnius import autoclass
        javaclass_vncorenlp = autoclass('vn.pipeline.VnCoreNLP')
        self.javaclass_String = autoclass('java.lang.String')

        self.annotators = list(annotators)
        if "wseg" not in self.annotators:
            self.annotators.append("wseg")

        self.model = javaclass_vncorenlp(self.annotators)

    def annotate_text(self, text):
        """Run the pipeline on `text` and return {sentence_index: [token dicts]}."""
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        java_text = self.javaclass_String(text)
        annotation = javaclass_Annotation(java_text)
        self.model.annotate(annotation)

        dict_sentences = {}
        # Sentences are separated by blank lines; the final split element is empty.
        list_sentences = annotation.toString().split("\n\n")[:-1]
        for i in range(len(list_sentences)):
            list_words = list_sentences[i].split("\n")
            list_dict_words = []
            for word in list_words:
                dict_word = {}
                word = word.replace("\t\t", "\t")
                # Columns: index, wordForm, posTag, nerLabel, head, depLabel.
                list_tags = word.split("\t")
                dict_word["index"] = int(list_tags[0])
                dict_word["wordForm"] = list_tags[1]
                dict_word["posTag"] = list_tags[2]
                dict_word["nerLabel"] = list_tags[3]
                if "parse" in self.annotators:
                    dict_word["head"] = int(list_tags[4])
                else:
                    # Without the parser, the head column is a placeholder string.
                    dict_word["head"] = list_tags[4]
                dict_word["depLabel"] = list_tags[5]
                list_dict_words.append(dict_word)
            dict_sentences[i] = list_dict_words
        return dict_sentences

    def tokenize(self, text):
        """Return a list of sentences, each a list of word forms."""
        annotated_sens = self.annotate_text(text=text)
        output = []
        for id_sen in annotated_sens:
            annotated_sen = annotated_sens[id_sen]
            out = [s["wordForm"] for s in annotated_sen]
            output.append(out)
        return output

    def pos_tag(self, text):
        """Return a list of sentences, each a list of (wordForm, posTag) pairs."""
        annotated_sens = self.annotate_text(text=text)
        output = []
        for id_sen in annotated_sens:
            annotated_sen = annotated_sens[id_sen]
            out = [(s["wordForm"], s["posTag"]) for s in annotated_sen]
            output.append(out)
        return output

    def ner(self, text):
        """Return a list of sentences, each a list of (wordForm, nerLabel) pairs."""
        annotated_sens = self.annotate_text(text=text)
        output = []
        for id_sen in annotated_sens:
            annotated_sen = annotated_sens[id_sen]
            out = [(s["wordForm"], s["nerLabel"]) for s in annotated_sen]
            output.append(out)
        return output

    def word_segment(self, text):
        """Return a list of sentences as strings of space-joined segmented words."""
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        java_text = self.javaclass_String(text)
        annotation = javaclass_Annotation(java_text)
        self.model.annotate(annotation)

        list_segmented_sentences = []
        list_sentences = annotation.toString().split("\n\n")[:-1]
        for sent in list_sentences:
            list_words = sent.split("\n")
            list_segmented_words = []
            for word in list_words:
                word = word.replace("\t\t", "\t")
                list_tags = word.split("\t")
                # The second column is the (underscore-joined) word form.
                list_segmented_words.append(list_tags[1])
            list_segmented_sentences.append(" ".join(list_segmented_words))
        return list_segmented_sentences

    def print_out(self, dict_sentences):
        """Print annotate_text() output in the tab-separated VnCoreNLP format."""
        for sent in dict_sentences.keys():
            list_dict_words = dict_sentences[sent]
            for word in list_dict_words:
                print(str(word["index"]) + "\t" + word["wordForm"] + "\t" + word["posTag"] + "\t"
                      + word["nerLabel"] + "\t" + str(word["head"]) + "\t" + word["depLabel"])
            print("")

    def annotate_file(self, input_file, output_file):
        """Annotate a whole file; paths are resolved from the original working directory."""
        os.chdir(self.current_working_dir)
        input_str = self.javaclass_String(input_file)
        output_str = self.javaclass_String(output_file)
        self.model.processPipeline(input_str, output_str, self.annotators)


if __name__ == '__main__':
    download_model(save_dir='/home2/vietle/icgpt/vncorenlp-1.2')
    model = VnCoreNLP(annotators=["wseg", "pos", "ner"], save_dir='/home2/vietle/icgpt/vncorenlp-1.2')

    text = "Sau khi tốt nghiệp Trung học năm 1975, ông theo học dự bị Ngoại ngữ tại Đại học Ngoại ngữ (nay là Trường Đại học Hà Nội)."
    out = model.tokenize(text)
    print(out)
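
    # A minimal usage sketch of the other methods defined above, on the same model
    # and text; since "parse" was not requested here, the head/depLabel fields
    # printed by print_out are just placeholder strings.
    print(model.pos_tag(text))       # list of sentences of (wordForm, posTag) pairs
    print(model.ner(text))           # list of sentences of (wordForm, nerLabel) pairs
    print(model.word_segment(text))  # list of segmented sentences as plain strings
    annotated = model.annotate_text(text)
    model.print_out(annotated)       # tab-separated index/word/POS/NER/head/dep rows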