# cuongnguyen910's picture
# Upload folder using huggingface_hub
# 5120311 verified
import jnius_config
import os
import shutil
# NOTE(review): hard-coded install location of the VnCoreNLP-1.2 release;
# adjust for your environment or run download_model() for this path first.
save_dir = "/home2/vietle/icgpt/vncorenlp-1.2"
# JVM options must be registered on jnius_config BEFORE the first
# `from jnius import ...` anywhere in the process; later calls are ignored.
max_heap_size='-Xmx4g'  # cap the embedded JVM heap at 4 GB
jnius_config.add_options(max_heap_size)
# Put the VnCoreNLP jar on the JVM classpath so autoclass('vn.pipeline.*') resolves.
jnius_config.set_classpath(save_dir + "/VnCoreNLP-1.2.jar")
def download_model(save_dir='./'):
    """Download the VnCoreNLP-1.2 jar and its model files into ``save_dir``.

    After a successful call, ``save_dir`` contains ``VnCoreNLP-1.2.jar`` and a
    ``models/`` tree with the dep/ner/postagger/wordsegmenter resources. The
    download is skipped (with a message) when both the jar and the ``models``
    directory already exist.

    Args:
        save_dir: Directory that receives the jar and the ``models`` tree.
            Subdirectories are created as needed. Defaults to the current
            directory.

    Raises:
        urllib.error.URLError: if any resource cannot be fetched.
    """
    # Local import: only needed when we actually download.
    import urllib.request

    # endswith() is safe on the empty string, unlike the old save_dir[-1].
    if save_dir.endswith('/'):
        save_dir = save_dir[:-1]
    if os.path.isdir(save_dir + "/models") and os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
        print("VnCoreNLP model folder " + save_dir + " already exists! Please load VnCoreNLP from this folder!")
        return

    base_url = "https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/"
    # Relative paths are identical on the server and on disk.
    resources = [
        "VnCoreNLP-1.2.jar",
        "models/wordsegmenter/vi-vocab",
        "models/wordsegmenter/wordsegmenter.rdr",
        "models/postagger/vi-tagger",
        "models/ner/vi-500brownclusters.xz",
        "models/ner/vi-ner.xz",
        "models/ner/vi-pretrainedembeddings.xz",
        "models/dep/vi-dep.xz",
    ]
    for sub in ("models/dep", "models/ner", "models/postagger", "models/wordsegmenter"):
        # makedirs(exist_ok=True) tolerates a partially-created tree, unlike
        # the bare os.mkdir() it replaces (which raised FileExistsError).
        os.makedirs(os.path.join(save_dir, sub), exist_ok=True)
    for rel in resources:
        # urlretrieve writes straight to the target path and raises on
        # failure, unlike os.system("wget ...") whose exit status was
        # ignored and which required an external wget binary.
        urllib.request.urlretrieve(base_url + rel, os.path.join(save_dir, rel))
class VnCoreNLP:
    """pyjnius wrapper around the Java ``vn.pipeline.VnCoreNLP`` pipeline.

    Requires the jar and model files laid out by :func:`download_model`.
    Instantiating this class starts a JVM (via pyjnius) and chdir()s into
    ``save_dir`` because the Java pipeline resolves its model files relative
    to the current working directory.
    """

    def __init__(self, annotators=None, save_dir='./'):
        """Load the Java pipeline.

        Args:
            annotators: Subset of ``["wseg", "pos", "ner", "parse"]``.
                ``"wseg"`` is always included (the other annotators need it).
                Defaults to all four.
            save_dir: Directory containing ``VnCoreNLP-1.2.jar`` and
                ``models/`` (see :func:`download_model`).

        Raises:
            Exception: if the jar or the models directory is missing.
        """
        # None-sentinel instead of a mutable default list: the original
        # default was mutated by the append("wseg") below and leaked
        # between instances.
        if annotators is None:
            annotators = ["wseg", "pos", "ner", "parse"]
        # endswith() is safe on the empty string, unlike save_dir[-1].
        if save_dir.endswith('/'):
            save_dir = save_dir[:-1]
        if not os.path.isdir(save_dir + "/models") or not os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
            raise Exception("Please download the VnCoreNLP model!")
        self.current_working_dir = os.getcwd()
        # The Java side loads model files relative to the CWD.
        os.chdir(save_dir)
        from jnius import autoclass
        javaclass_vncorenlp = autoclass('vn.pipeline.VnCoreNLP')
        self.javaclass_String = autoclass('java.lang.String')
        # Copy so the caller's list (or the default) is never mutated.
        self.annotators = list(annotators)
        if "wseg" not in self.annotators:
            self.annotators.append("wseg")
        self.model = javaclass_vncorenlp(self.annotators)

    def annotate_text(self, text):
        """Annotate *text* and return ``{sentence_index: [word dicts]}``.

        Each word dict carries the six CoNLL-style columns emitted by
        VnCoreNLP: ``index``, ``wordForm``, ``posTag``, ``nerLabel``,
        ``head`` and ``depLabel``. ``head`` is converted to int only when
        the ``parse`` annotator ran; otherwise the raw placeholder string
        is kept.
        """
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        annotation = javaclass_Annotation(self.javaclass_String(text))
        self.model.annotate(annotation)
        dict_sentences = {}
        # Sentences are blank-line separated; the final split item is empty.
        sentences = annotation.toString().split("\n\n")[:-1]
        for i, sentence in enumerate(sentences):
            words = []
            for line in sentence.split("\n"):
                # Collapse the double tab the Java toString() puts between columns.
                cols = line.replace("\t\t", "\t").split("\t")
                words.append({
                    "index": int(cols[0]),
                    "wordForm": cols[1],
                    "posTag": cols[2],
                    "nerLabel": cols[3],
                    "head": int(cols[4]) if "parse" in self.annotators else cols[4],
                    "depLabel": cols[5],
                })
            dict_sentences[i] = words
        return dict_sentences

    def tokenize(self, text):
        """Return a list (one entry per sentence) of lists of word forms."""
        annotated_sens = self.annotate_text(text=text)
        return [[w["wordForm"] for w in annotated_sens[i]] for i in annotated_sens]

    def pos_tag(self, text):
        """Return, per sentence, a list of ``(wordForm, posTag)`` pairs."""
        annotated_sens = self.annotate_text(text=text)
        return [[(w["wordForm"], w["posTag"]) for w in annotated_sens[i]] for i in annotated_sens]

    def ner(self, text):
        """Return, per sentence, a list of ``(wordForm, nerLabel)`` pairs."""
        annotated_sens = self.annotate_text(text=text)
        return [[(w["wordForm"], w["nerLabel"]) for w in annotated_sens[i]] for i in annotated_sens]

    def word_segment(self, text):
        """Return each sentence as one string of space-joined segmented words."""
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        annotation = javaclass_Annotation(self.javaclass_String(text))
        self.model.annotate(annotation)
        segmented_sentences = []
        for sentence in annotation.toString().split("\n\n")[:-1]:
            # Column 1 of each tab-separated line is the (segmented) word form.
            forms = [line.replace("\t\t", "\t").split("\t")[1] for line in sentence.split("\n")]
            segmented_sentences.append(" ".join(forms))
        return segmented_sentences

    def print_out(self, dict_sentences):
        """Pretty-print an :meth:`annotate_text` result, one word per line."""
        for sent in dict_sentences.keys():
            for word in dict_sentences[sent]:
                print(str(word["index"]) + "\t" + word["wordForm"] + "\t" + word["posTag"] + "\t" + word["nerLabel"] + "\t" + str(word["head"]) + "\t" + word["depLabel"])
            print("")

    def annotate_file(self, input_file, output_file):
        """Run the full pipeline on *input_file*, writing results to *output_file*.

        Paths are resolved against the working directory recorded at
        construction time (before the chdir into the model folder).
        """
        os.chdir(self.current_working_dir)
        input_str = self.javaclass_String(input_file)
        output_str = self.javaclass_String(output_file)
        self.model.processPipeline(input_str, output_str, self.annotators)
if __name__ == '__main__':
    # Demo: fetch the models (no-op after the first run), then tokenize one sentence.
    model_dir = '/home2/vietle/icgpt/vncorenlp-1.2'
    download_model(save_dir=model_dir)
    pipeline = VnCoreNLP(annotators=["wseg", "pos", "ner"], save_dir=model_dir)
    sample = "Sau khi tốt nghiệp Trung học năm 1975, ông theo học dự bị Ngoại ngữ tại Đại học Ngoại ngữ (nay là Trường Đại học Hà Nội)."
    print(pipeline.tokenize(sample))