# cuongnguyen910's picture
# Upload folder using huggingface_hub
# 5120311 verified
import jnius_config
import os
import shutil
# NOTE(review): hard-coded install location of the VnCoreNLP-1.2 release;
# adjust for your environment or run download_model() for this path first.
save_dir = "/home2/vietle/icgpt/vncorenlp-1.2"
# JVM options must be registered on jnius_config BEFORE the first
# `from jnius import ...` anywhere in the process; later calls are ignored.
max_heap_size='-Xmx4g'  # cap the embedded JVM heap at 4 GB
jnius_config.add_options(max_heap_size)
# Put the VnCoreNLP jar on the JVM classpath so autoclass('vn.pipeline.*') resolves.
jnius_config.set_classpath(save_dir + "/VnCoreNLP-1.2.jar")
def download_model(save_dir='./'):
    """Download the VnCoreNLP-1.2 jar and its model files into ``save_dir``.

    After a successful call, ``save_dir`` contains ``VnCoreNLP-1.2.jar`` and a
    ``models/`` tree with the dep/ner/postagger/wordsegmenter resources. The
    download is skipped (with a message) when both the jar and the ``models``
    directory already exist.

    Args:
        save_dir: Directory that receives the jar and the ``models`` tree.
            Subdirectories are created as needed. Defaults to the current
            directory.

    Raises:
        urllib.error.URLError: if any resource cannot be fetched.
    """
    # Local import: only needed when we actually download.
    import urllib.request

    # endswith() is safe on the empty string, unlike the old save_dir[-1].
    if save_dir.endswith('/'):
        save_dir = save_dir[:-1]
    if os.path.isdir(save_dir + "/models") and os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
        print("VnCoreNLP model folder " + save_dir + " already exists! Please load VnCoreNLP from this folder!")
        return

    base_url = "https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/"
    # Relative paths are identical on the server and on disk.
    resources = [
        "VnCoreNLP-1.2.jar",
        "models/wordsegmenter/vi-vocab",
        "models/wordsegmenter/wordsegmenter.rdr",
        "models/postagger/vi-tagger",
        "models/ner/vi-500brownclusters.xz",
        "models/ner/vi-ner.xz",
        "models/ner/vi-pretrainedembeddings.xz",
        "models/dep/vi-dep.xz",
    ]
    for sub in ("models/dep", "models/ner", "models/postagger", "models/wordsegmenter"):
        # makedirs(exist_ok=True) tolerates a partially-created tree, unlike
        # the bare os.mkdir() it replaces (which raised FileExistsError).
        os.makedirs(os.path.join(save_dir, sub), exist_ok=True)
    for rel in resources:
        # urlretrieve writes straight to the target path and raises on
        # failure, unlike os.system("wget ...") whose exit status was
        # ignored and which required an external wget binary.
        urllib.request.urlretrieve(base_url + rel, os.path.join(save_dir, rel))
class VnCoreNLP:
    """pyjnius wrapper around the Java ``vn.pipeline.VnCoreNLP`` pipeline.

    Requires the jar and model files laid out by :func:`download_model`.
    Instantiating this class starts a JVM (via pyjnius) and chdir()s into
    ``save_dir`` because the Java pipeline resolves its model files relative
    to the current working directory.
    """

    def __init__(self, annotators=None, save_dir='./'):
        """Load the Java pipeline.

        Args:
            annotators: Subset of ``["wseg", "pos", "ner", "parse"]``.
                ``"wseg"`` is always included (the other annotators need it).
                Defaults to all four.
            save_dir: Directory containing ``VnCoreNLP-1.2.jar`` and
                ``models/`` (see :func:`download_model`).

        Raises:
            Exception: if the jar or the models directory is missing.
        """
        # None-sentinel instead of a mutable default list: the original
        # default was mutated by the append("wseg") below and leaked
        # between instances.
        if annotators is None:
            annotators = ["wseg", "pos", "ner", "parse"]
        # endswith() is safe on the empty string, unlike save_dir[-1].
        if save_dir.endswith('/'):
            save_dir = save_dir[:-1]
        if not os.path.isdir(save_dir + "/models") or not os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
            raise Exception("Please download the VnCoreNLP model!")
        self.current_working_dir = os.getcwd()
        # The Java side loads model files relative to the CWD.
        os.chdir(save_dir)
        from jnius import autoclass
        javaclass_vncorenlp = autoclass('vn.pipeline.VnCoreNLP')
        self.javaclass_String = autoclass('java.lang.String')
        # Copy so the caller's list (or the default) is never mutated.
        self.annotators = list(annotators)
        if "wseg" not in self.annotators:
            self.annotators.append("wseg")
        self.model = javaclass_vncorenlp(self.annotators)

    def annotate_text(self, text):
        """Annotate *text* and return ``{sentence_index: [word dicts]}``.

        Each word dict carries the six CoNLL-style columns emitted by
        VnCoreNLP: ``index``, ``wordForm``, ``posTag``, ``nerLabel``,
        ``head`` and ``depLabel``. ``head`` is converted to int only when
        the ``parse`` annotator ran; otherwise the raw placeholder string
        is kept.
        """
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        annotation = javaclass_Annotation(self.javaclass_String(text))
        self.model.annotate(annotation)
        dict_sentences = {}
        # Sentences are blank-line separated; the final split item is empty.
        sentences = annotation.toString().split("\n\n")[:-1]
        for i, sentence in enumerate(sentences):
            words = []
            for line in sentence.split("\n"):
                # Collapse the double tab the Java toString() puts between columns.
                cols = line.replace("\t\t", "\t").split("\t")
                words.append({
                    "index": int(cols[0]),
                    "wordForm": cols[1],
                    "posTag": cols[2],
                    "nerLabel": cols[3],
                    "head": int(cols[4]) if "parse" in self.annotators else cols[4],
                    "depLabel": cols[5],
                })
            dict_sentences[i] = words
        return dict_sentences

    def tokenize(self, text):
        """Return a list (one entry per sentence) of lists of word forms."""
        annotated_sens = self.annotate_text(text=text)
        return [[w["wordForm"] for w in annotated_sens[i]] for i in annotated_sens]

    def pos_tag(self, text):
        """Return, per sentence, a list of ``(wordForm, posTag)`` pairs."""
        annotated_sens = self.annotate_text(text=text)
        return [[(w["wordForm"], w["posTag"]) for w in annotated_sens[i]] for i in annotated_sens]

    def ner(self, text):
        """Return, per sentence, a list of ``(wordForm, nerLabel)`` pairs."""
        annotated_sens = self.annotate_text(text=text)
        return [[(w["wordForm"], w["nerLabel"]) for w in annotated_sens[i]] for i in annotated_sens]

    def word_segment(self, text):
        """Return each sentence as one string of space-joined segmented words."""
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        annotation = javaclass_Annotation(self.javaclass_String(text))
        self.model.annotate(annotation)
        segmented_sentences = []
        for sentence in annotation.toString().split("\n\n")[:-1]:
            # Column 1 of each tab-separated line is the (segmented) word form.
            forms = [line.replace("\t\t", "\t").split("\t")[1] for line in sentence.split("\n")]
            segmented_sentences.append(" ".join(forms))
        return segmented_sentences

    def print_out(self, dict_sentences):
        """Pretty-print an :meth:`annotate_text` result, one word per line."""
        for sent in dict_sentences.keys():
            for word in dict_sentences[sent]:
                print(str(word["index"]) + "\t" + word["wordForm"] + "\t" + word["posTag"] + "\t" + word["nerLabel"] + "\t" + str(word["head"]) + "\t" + word["depLabel"])
            print("")

    def annotate_file(self, input_file, output_file):
        """Run the full pipeline on *input_file*, writing results to *output_file*.

        Paths are resolved against the working directory recorded at
        construction time (before the chdir into the model folder).
        """
        os.chdir(self.current_working_dir)
        input_str = self.javaclass_String(input_file)
        output_str = self.javaclass_String(output_file)
        self.model.processPipeline(input_str, output_str, self.annotators)
if __name__ == '__main__':
    # Demo: fetch the models (no-op after the first run), then tokenize one sentence.
    model_dir = '/home2/vietle/icgpt/vncorenlp-1.2'
    download_model(save_dir=model_dir)
    pipeline = VnCoreNLP(annotators=["wseg", "pos", "ner"], save_dir=model_dir)
    sample = "Sau khi tốt nghiệp Trung học năm 1975, ông theo học dự bị Ngoại ngữ tại Đại học Ngoại ngữ (nay là Trường Đại học Hà Nội)."
    print(pipeline.tokenize(sample))