import jnius_config
import os
import shutil

# JVM options and the classpath must be set before the first `from jnius import autoclass`,
# otherwise pyjnius starts the JVM without them.
save_dir = "/home2/vietle/icgpt/vncorenlp-1.2"
max_heap_size = '-Xmx4g'
jnius_config.add_options(max_heap_size)
jnius_config.set_classpath(save_dir + "/VnCoreNLP-1.2.jar")
def download_model(save_dir='./'):
    """Download the VnCoreNLP-1.2 jar and its model files into save_dir."""
    if save_dir[-1] == '/':
        save_dir = save_dir[:-1]
    if os.path.isdir(save_dir + "/models") and os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
        print("VnCoreNLP model folder " + save_dir + " already exists! Please load VnCoreNLP from this folder!")
    else:
        # create the model directory layout (makedirs also creates save_dir itself if needed)
        os.makedirs(save_dir + "/models/dep")
        os.makedirs(save_dir + "/models/ner")
        os.makedirs(save_dir + "/models/postagger")
        os.makedirs(save_dir + "/models/wordsegmenter")
        # jar
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.2.jar")
        shutil.move("VnCoreNLP-1.2.jar", save_dir + "/VnCoreNLP-1.2.jar")
        # word segmenter
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab")
        os.system(
            "wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr")
        shutil.move("vi-vocab", save_dir + "/models/wordsegmenter/vi-vocab")
        shutil.move("wordsegmenter.rdr", save_dir + "/models/wordsegmenter/wordsegmenter.rdr")
        # POS tagger
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/postagger/vi-tagger")
        shutil.move("vi-tagger", save_dir + "/models/postagger/vi-tagger")
        # NER
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-500brownclusters.xz")
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-ner.xz")
        os.system(
            "wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-pretrainedembeddings.xz")
        shutil.move("vi-500brownclusters.xz", save_dir + "/models/ner/vi-500brownclusters.xz")
        shutil.move("vi-ner.xz", save_dir + "/models/ner/vi-ner.xz")
        shutil.move("vi-pretrainedembeddings.xz", save_dir + "/models/ner/vi-pretrainedembeddings.xz")
        # dependency parser
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/dep/vi-dep.xz")
        shutil.move("vi-dep.xz", save_dir + "/models/dep/vi-dep.xz")
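# download_model above shells out to wget, which must be installed and on PATH.
# Below is a minimal, optional sketch of the same step in pure Python using
# urllib.request; download_model_py is a hypothetical helper, not part of the
# original API, and it assumes the same file layout and URLs as download_model.
def download_model_py(save_dir='./'):
    from urllib.request import urlretrieve
    base = "https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master"
    files = [
        "VnCoreNLP-1.2.jar",
        "models/wordsegmenter/vi-vocab",
        "models/wordsegmenter/wordsegmenter.rdr",
        "models/postagger/vi-tagger",
        "models/ner/vi-500brownclusters.xz",
        "models/ner/vi-ner.xz",
        "models/ner/vi-pretrainedembeddings.xz",
        "models/dep/vi-dep.xz",
    ]
    for rel_path in files:
        target = os.path.join(save_dir, rel_path)
        # create the parent directory, then fetch the file into place
        os.makedirs(os.path.dirname(target) or ".", exist_ok=True)
        urlretrieve(base + "/" + rel_path, target)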
class VnCoreNLP:
    def __init__(self, annotators=("wseg", "pos", "ner", "parse"), save_dir='./'):
        if save_dir[-1] == '/':
            save_dir = save_dir[:-1]
        if not os.path.isdir(save_dir + "/models") or not os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
            raise Exception("Please download the VnCoreNLP model!")
        self.current_working_dir = os.getcwd()
        os.chdir(save_dir)
        from jnius import autoclass
        javaclass_vncorenlp = autoclass('vn.pipeline.VnCoreNLP')
        self.javaclass_String = autoclass('java.lang.String')
        # Copy the annotator list so the caller's list (or the shared default) is never mutated.
        self.annotators = list(annotators)
        if "wseg" not in self.annotators:
            self.annotators.append("wseg")
        self.model = javaclass_vncorenlp(self.annotators)
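    # Example (assumed paths: the jar and model files were downloaded to ./vncorenlp-1.2,
    # e.g. via download_model(save_dir='./vncorenlp-1.2')):
    #   nlp = VnCoreNLP(annotators=["wseg", "pos", "ner"], save_dir='./vncorenlp-1.2')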
    def annotate_text(self, text):
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        java_text = self.javaclass_String(text)  # avoid shadowing the builtin str
        annotation = javaclass_Annotation(java_text)
        self.model.annotate(annotation)
        dict_sentences = {}
        # annotation.toString() yields one blank-line-separated block per sentence and one
        # tab-separated token per line: index, word form, POS tag, NER label, head, dependency label.
        list_sentences = annotation.toString().split("\n\n")[:-1]
        for i in range(len(list_sentences)):
            list_words = list_sentences[i].split("\n")
            list_dict_words = []
            for word in list_words:
                dict_word = {}
                word = word.replace("\t\t", "\t")
                list_tags = word.split("\t")
                dict_word["index"] = int(list_tags[0])
                dict_word["wordForm"] = list_tags[1]
                dict_word["posTag"] = list_tags[2]
                dict_word["nerLabel"] = list_tags[3]
                if "parse" in self.annotators:
                    dict_word["head"] = int(list_tags[4])
                else:
                    dict_word["head"] = list_tags[4]
                dict_word["depLabel"] = list_tags[5]
                list_dict_words.append(dict_word)
            dict_sentences[i] = list_dict_words
        return dict_sentences
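    # The returned structure, as built by the parsing above, looks roughly like:
    #   {0: [{"index": 1, "wordForm": "...", "posTag": "...", "nerLabel": "...",
    #         "head": ..., "depLabel": "..."}, ...],
    #    1: [...], ...}
    # i.e. one key per sentence and one dict per token.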
    def tokenize(self, text):
        annotated_sens = self.annotate_text(text=text)
        output = []
        for id_sen in annotated_sens:
            annotated_sen = annotated_sens[id_sen]
            out = [s["wordForm"] for s in annotated_sen]
            output.append(out)
        return output

    def pos_tag(self, text):
        annotated_sens = self.annotate_text(text=text)
        output = []
        for id_sen in annotated_sens:
            annotated_sen = annotated_sens[id_sen]
            out = [(s["wordForm"], s["posTag"]) for s in annotated_sen]
            output.append(out)
        return output

    def ner(self, text):
        annotated_sens = self.annotate_text(text=text)
        output = []
        for id_sen in annotated_sens:
            annotated_sen = annotated_sens[id_sen]
            out = [(s["wordForm"], s["nerLabel"]) for s in annotated_sen]
            output.append(out)
        return output
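    # Usage sketch (assumes a model directory of ./vncorenlp-1.2; the tag sets are
    # whatever VnCoreNLP itself emits):
    #   nlp = VnCoreNLP(annotators=["wseg", "pos", "ner"], save_dir='./vncorenlp-1.2')
    #   nlp.tokenize("...")   # -> [["tok1", "tok2", ...], ...] one list per sentence
    #   nlp.pos_tag("...")    # -> [[("tok1", "POS"), ...], ...]
    #   nlp.ner("...")        # -> [[("tok1", "NER-label"), ...], ...]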
    def word_segment(self, text):
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        java_text = self.javaclass_String(text)  # avoid shadowing the builtin str
        annotation = javaclass_Annotation(java_text)
        self.model.annotate(annotation)
        list_segmented_sentences = []
        list_sentences = annotation.toString().split("\n\n")[:-1]
        for sent in list_sentences:
            list_words = sent.split("\n")
            list_segmented_words = []
            for word in list_words:
                word = word.replace("\t\t", "\t")
                list_tags = word.split("\t")
                list_segmented_words.append(list_tags[1])
            list_segmented_sentences.append(" ".join(list_segmented_words))
        return list_segmented_sentences
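    # word_segment returns one space-joined string per sentence; multi-syllable words
    # come back joined by underscores as emitted by VnCoreNLP, so the output looks
    # roughly like ["Ông Nguyễn_Khắc_Chúc đang làm_việc tại Đại_học Quốc_gia Hà_Nội ."].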
    def print_out(self, dict_sentences):
        for sent in dict_sentences.keys():
            list_dict_words = dict_sentences[sent]
            for word in list_dict_words:
                print(str(word["index"]) + "\t" + word["wordForm"] + "\t" + word["posTag"] + "\t"
                      + word["nerLabel"] + "\t" + str(word["head"]) + "\t" + word["depLabel"])
            print("")

    def annotate_file(self, input_file, output_file):
        # Restore the original working directory: __init__ chdir-ed into save_dir, so
        # relative input/output paths should resolve from where the caller started.
        os.chdir(self.current_working_dir)
        input_str = self.javaclass_String(input_file)
        output_str = self.javaclass_String(output_file)
        self.model.processPipeline(input_str, output_str, self.annotators)
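    # Sketch of file-level annotation ("input.txt" / "output.txt" are placeholder paths;
    # processPipeline presumably writes the annotated result to the output file):
    #   nlp.annotate_file(input_file="input.txt", output_file="output.txt")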
if __name__ == '__main__':
    download_model(save_dir='/home2/vietle/icgpt/vncorenlp-1.2')
    model = VnCoreNLP(annotators=["wseg", "pos", "ner"], save_dir='/home2/vietle/icgpt/vncorenlp-1.2')
    # output = model.annotate_text("Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây.")
    # print(output)
    text = "Sau khi tốt nghiệp Trung học năm 1975, ông theo học dự bị Ngoại ngữ tại Đại học Ngoại ngữ (nay là Trường Đại học Hà Nội)."
    out = model.tokenize(text)
    print(out)
    # model.print_out(output)