Spaces:

ando55
/

clinical_segment_splitter

Runtime error

kenichiro

commit

46a030d over 2 years ago

20.4 kB

	import gensim
	import MeCab
	import pickle
	from gensim.models.wrappers.fasttext import FastText
	#import fasttext as ft
	import random
	import mojimoji
	import numpy as np
	from tqdm import tqdm

	def ymyi(lis):
	    """Turn a segmented text file into token vectors and boundary labels.

	    The file is read line by line.  Consecutive non-empty lines are
	    concatenated into one record (after ``mojimoji.han_to_zen``) and an
	    empty line closes the record.  The closed record is tokenised with
	    MeCab (wakati mode, MANBYO user dictionary) and every token receives

	    * a vector looked up in the fastText model ``cc.ja.300.bin``; if the
	      lookup raises ``KeyError`` the ``"<unk>"`` entry of the object
	      unpickled from ``fm_space.pickle`` is used instead;
	    * a label that is 1 when the end of one of the original lines falls
	      inside the token or exactly at its end, else 0.  The last token of
	      a record is always labelled 1.

	    A record that is not followed by an empty line (e.g. at end of file)
	    is never emitted.

	    Args:
	        lis: Path of the input text file.

	    Returns:
	        ``(num, text, labels)`` -- three parallel lists with one entry per
	        record: the zero-based index of the empty line that closed the
	        record, the list of token vectors, and the list of 0/1 labels.
	    """
	    wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")

	    # NOTE(review): pickle.load executes arbitrary code from the file; only
	    # use a pickle that was produced locally.
	    with open('fm_space.pickle', 'rb') as f:
	        fm = pickle.load(f)

	    # The original called ``ft.load_model`` but ``import fasttext as ft`` is
	    # commented out at the top of the file, so the call raised NameError.
	    # The gensim wrapper that the file already imports loads the same
	    # Facebook ``.bin`` format and raises KeyError for words it cannot
	    # build a vector for, which is what the fallback below expects.
	    # NOTE(review): confirm this wrapper is the intended replacement.
	    model = FastText.load_fasttext_format("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")

	    num, text, labels = [], [], []
	    sent = ""      # concatenated, width-normalised text of the open record
	    sparate = []   # offsets in ``sent`` at which each source line ends

	    with open(lis) as src:
	        for n, line in enumerate(src):
	            line = line.strip("\t").rstrip("\n")

	            if line != "":
	                sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
	                # Measure the offset on the converted text: han_to_zen can
	                # change the length (half-width kana + voiced mark become
	                # one character), and tokens are cut from ``sent``.
	                sparate.append(len(sent))
	                continue

	            if sent == "":
	                continue

	            tokens = wakati.parse(sent).split(" ")[:-1]
	            label = []
	            texts = []
	            start = 0
	            for tok in tokens:
	                end = start + len(tok)
	                # Exactly one label per token, even when several line ends
	                # fall inside the same token.
	                hit = any(start < b < end or b == end for b in sparate)
	                label.append(1 if hit else 0)
	                start = end
	                try:
	                    texts.append(model[tok])
	                except KeyError:
	                    texts.append(fm["<unk>"])

	            sent = ""
	            sparate = []
	            if not label:
	                # Tokeniser produced nothing (e.g. whitespace-only record);
	                # there is no last token to mark, so drop the record.
	                continue

	            label[-1] = 1
	            labels.append(label)
	            text.append(texts)
	            num.append(n)

	    return num, text, labels

	def nmni(lis):
	    """Turn a segmented text file into embedding lookups and boundary labels.

	    The file is read line by line.  Consecutive non-empty lines are
	    concatenated into one record (after ``mojimoji.han_to_zen``) and an
	    empty line closes the record.  The closed record is tokenised with
	    MeCab (wakati mode, default dictionary) and every token receives

	    * ``fm[token]`` where ``fm`` is the object unpickled from
	      ``fm_space.pickle`` (``fm["<unk>"]`` when that raises ``KeyError``);
	    * a label that is 1 when the end of one of the original lines falls
	      inside the token or exactly at its end, else 0.  The last token of
	      a record is always labelled 1.

	    A record that is not followed by an empty line (e.g. at end of file)
	    is never emitted.

	    Args:
	        lis: Path of the input text file.

	    Returns:
	        ``(num, text, labels)`` -- three parallel lists with one entry per
	        record: the zero-based index of the empty line that closed the
	        record, the list of per-token ``fm`` entries, and the list of 0/1
	        labels.
	    """
	    wakati = MeCab.Tagger("-Owakati -b 81920")

	    # NOTE(review): pickle.load executes arbitrary code from the file; only
	    # use a pickle that was produced locally.
	    with open('fm_space.pickle', 'rb') as f:
	        fm = pickle.load(f)

	    num, text, labels = [], [], []
	    sent = ""      # concatenated, width-normalised text of the open record
	    sparate = []   # offsets in ``sent`` at which each source line ends

	    with open(lis) as src:
	        for n, line in enumerate(src):
	            line = line.strip("\t").rstrip("\n")

	            if line != "":
	                sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
	                # Measure the offset on the converted text: han_to_zen can
	                # change the length (half-width kana + voiced mark become
	                # one character), and tokens are cut from ``sent``.
	                sparate.append(len(sent))
	                continue

	            if sent == "":
	                continue

	            tokens = wakati.parse(sent).split(" ")[:-1]
	            label = []
	            texts = []
	            start = 0
	            for tok in tokens:
	                end = start + len(tok)
	                # Exactly one label per token, even when several line ends
	                # fall inside the same token.
	                hit = any(start < b < end or b == end for b in sparate)
	                label.append(1 if hit else 0)
	                start = end
	                try:
	                    texts.append(fm[tok])
	                except KeyError:
	                    texts.append(fm["<unk>"])

	            sent = ""
	            sparate = []
	            if not label:
	                # Tokeniser produced nothing (e.g. whitespace-only record);
	                # there is no last token to mark, so drop the record.
	                continue

	            label[-1] = 1
	            labels.append(label)
	            text.append(texts)
	            num.append(n)

	    return num, text, labels

	def nmni_finetune(lis):
	    """Turn a segmented text file into vocabulary indices and boundary labels.

	    The file is read line by line.  Consecutive non-empty lines are
	    concatenated into one record (after ``mojimoji.han_to_zen``) and an
	    empty line closes the record.  The closed record is tokenised with
	    MeCab (wakati mode, default dictionary) and every token receives

	    * ``fm.vocab[token].index`` where ``fm`` is the object unpickled from
	      ``fm.pickle`` (presumably a gensim ``KeyedVectors``); the index of
	      ``"<unk>"`` is used when the token is missing;
	    * a label that is 1 when the end of one of the original lines falls
	      inside the token or exactly at its end, else 0.  The last token of
	      a record is always labelled 1.

	    A record that is not followed by an empty line (e.g. at end of file)
	    is never emitted.

	    Args:
	        lis: Path of the input text file.

	    Returns:
	        ``(text, labels)`` -- two parallel lists with one ``numpy`` array
	        per record: the token indices and the 0/1 labels.
	    """
	    wakati = MeCab.Tagger("-Owakati -b 81920")

	    # NOTE(review): pickle.load executes arbitrary code from the file; only
	    # use a pickle that was produced locally.
	    with open('fm.pickle', 'rb') as f:
	        fm = pickle.load(f)

	    text, labels = [], []
	    sent = ""      # concatenated, width-normalised text of the open record
	    sparate = []   # offsets in ``sent`` at which each source line ends

	    with open(lis) as src:
	        for line in src:
	            line = line.strip("\t").rstrip("\n")

	            if line != "":
	                sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
	                # Measure the offset on the converted text: han_to_zen can
	                # change the length (half-width kana + voiced mark become
	                # one character), and tokens are cut from ``sent``.
	                sparate.append(len(sent))
	                continue

	            if sent == "":
	                continue

	            tokens = wakati.parse(sent).split(" ")[:-1]
	            label = []
	            texts = []
	            start = 0
	            for tok in tokens:
	                end = start + len(tok)
	                # Exactly one label per token, even when several line ends
	                # fall inside the same token.
	                hit = any(start < b < end or b == end for b in sparate)
	                label.append(1 if hit else 0)
	                start = end
	                try:
	                    texts.append(fm.vocab[tok].index)
	                except KeyError:
	                    texts.append(fm.vocab["<unk>"].index)

	            sent = ""
	            sparate = []
	            if not label:
	                # Tokeniser produced nothing (e.g. whitespace-only record);
	                # there is no last token to mark, so drop the record.
	                continue

	            label[-1] = 1
	            labels.append(np.array(label))
	            text.append(np.array(texts))

	    return text, labels



	def nmni_carte(lis, n_files=108):
	    """Tokenise a numbered series of text files line by line.

	    Files ``lis + "0.txt"``, ``lis + "1.txt"``, ... are read in order.  Each
	    non-blank line (after ``str.strip``) is tokenised with MeCab (wakati
	    mode) and converted to vocabulary indices via ``fm.vocab[token].index``,
	    where ``fm`` is unpickled from ``fm.pickle``; tokens missing from the
	    vocabulary get the index of ``"<unk>"``.  The label sequence of a line
	    is all zeros except for a 1 on its last token.

	    Args:
	        lis: Path prefix; the file number and ``".txt"`` are appended to it.
	        n_files: How many files to read, numbered from 0.  Defaults to 108,
	            the count that used to be hard-coded.

	    Returns:
	        ``(altex, allab, fukugenss)`` -- three lists with one entry per
	        file.  Each entry is a list with one item per kept line: a
	        ``numpy`` array of token indices, a ``numpy`` array of labels, and
	        the list of token strings respectively.
	    """
	    wakati = MeCab.Tagger("-Owakati -b 81920")

	    # NOTE(review): pickle.load executes arbitrary code from the file; only
	    # use a pickle that was produced locally.
	    with open('fm.pickle', 'rb') as f:
	        fm = pickle.load(f)

	    altex, allab, fukugenss = [], [], []
	    for n in tqdm(range(n_files)):
	        text, labels, fukugens = [], [], []
	        with open(lis + str(n) + ".txt") as src:
	            for line in src:
	                line = line.strip()
	                if line == "":
	                    continue

	                fukugen = wakati.parse(line).split(" ")[:-1]
	                if not fukugen:
	                    # No tokens means no last token to label; skip the line
	                    # instead of failing on ``label[-1]``.
	                    continue

	                texts = []
	                for tok in fukugen:
	                    try:
	                        texts.append(fm.vocab[tok].index)
	                    except KeyError:
	                        texts.append(fm.vocab["<unk>"].index)

	                label = [0] * len(fukugen)
	                label[-1] = 1
	                labels.append(np.array(label))
	                text.append(np.array(texts))
	                fukugens.append(fukugen)

	        allab.append(labels)
	        altex.append(text)
	        fukugenss.append(fukugens)

	    return altex, allab, fukugenss


	def nmni_finetune_s(lis):
	    """Tokenise a text file line by line into vocabulary indices and labels.

	    Side effect: on every call the word2vec text file ``cc.ja.300.vec`` is
	    loaded with gensim and the resulting object is written to
	    ``fm.pickle``, overwriting any existing file.

	    Each line of ``lis`` is tokenised with MeCab (wakati mode) and converted
	    to vocabulary indices via ``fm.vocab[token].index``; tokens missing
	    from the vocabulary get the index of ``"<unk>"``.  The label sequence
	    of a line is all zeros except for a 1 on its last token.  Lines that
	    yield no tokens (e.g. blank lines) are skipped.

	    Args:
	        lis: Path of the input text file.

	    Returns:
	        ``(text, labels)`` -- two parallel lists with one ``numpy`` array
	        per kept line: the token indices and the labels.
	    """
	    wakati = MeCab.Tagger("-Owakati -b 81920")
	    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
	    with open('fm.pickle', 'wb') as f:
	        pickle.dump(fm, f)

	    text, labels = [], []
	    with open(lis) as src:
	        for line in src:
	            line = line.strip("\t").rstrip("\n")
	            tokens = wakati.parse(line).split(" ")[:-1]
	            if not tokens:
	                # Previously a blank line reached ``label[-1] = 1`` with an
	                # empty list and raised IndexError; skip it, as
	                # nmni_finetune_ss does for empty lines.
	                continue

	            texts = []
	            for tok in tokens:
	                try:
	                    texts.append(fm.vocab[tok].index)
	                except KeyError:
	                    texts.append(fm.vocab["<unk>"].index)

	            label = [0] * len(tokens)
	            label[-1] = 1
	            labels.append(np.array(label))
	            text.append(np.array(texts))

	    return text, labels


	def nmni_finetune_ss(lis, n_files=108):
	    """Tokenise a numbered series of text files into indices and labels.

	    Side effect: on every call the word2vec text file ``cc.ja.300.vec`` is
	    loaded with gensim and the resulting object is written to
	    ``fm.pickle``, overwriting any existing file.

	    Files ``lis + "0.txt"``, ``lis + "1.txt"``, ... are read in order.  Each
	    non-empty line is tokenised with MeCab (wakati mode) and converted to
	    vocabulary indices via ``fm.vocab[token].index``; tokens missing from
	    the vocabulary get the index of ``"<unk>"``.  The label sequence of a
	    line is all zeros except for a 1 on its last token.

	    Args:
	        lis: Path prefix; the file number and ``".txt"`` are appended to it.
	        n_files: How many files to read, numbered from 0.  Defaults to 108,
	            the count that used to be hard-coded.

	    Returns:
	        ``(t, l)`` -- two lists with one entry per file; each entry is a
	        list of ``numpy`` arrays (token indices in ``t``, labels in ``l``),
	        one array per kept line.
	    """
	    wakati = MeCab.Tagger("-Owakati -b 81920")
	    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
	    with open('fm.pickle', 'wb') as f:
	        pickle.dump(fm, f)

	    t, l = [], []
	    # ``file_no`` and ``tok`` are kept distinct: the original reused ``i``
	    # for both the file number and the token loop.
	    for file_no in range(n_files):
	        text, labels = [], []
	        with open(lis + str(file_no) + ".txt") as src:
	            for line in src:
	                line = line.strip("\t").rstrip("\n")
	                if line == "":
	                    continue

	                tokens = wakati.parse(line).split(" ")[:-1]
	                if not tokens:
	                    # No tokens means no last token to label; skip the line
	                    # instead of failing on ``label[-1]``.
	                    continue

	                texts = []
	                for tok in tokens:
	                    try:
	                        texts.append(fm.vocab[tok].index)
	                    except KeyError:
	                        texts.append(fm.vocab["<unk>"].index)

	                label = [0] * len(tokens)
	                label[-1] = 1
	                labels.append(np.array(label))
	                text.append(np.array(texts))

	        t.append(text)
	        l.append(labels)

	    return t, l

	#model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
	#print(model.get_subwords("間質性肺炎"))
	#print(model.get_subwords("誤嚥性肺炎"))
	#print(model.get_subwords("談話ユニット分割"))

	"""
	texts = []
	sent = ""
	sparate = []
	label = []
	ruiseki = 0
	ruiseki2 = 0
	alls = []
	for n, line in enumerate(open("/clwork/ando/SEGBOT/randomdata.tsv")):
	line = line.strip("\t").rstrip("\n")
	if line == "":
	if sent == "":
	continue
	alls.append(sent)
	sent = ""
	continue
	else:
	sent += line
	if len(sent) != 0:
	alls.append(sent)
	random.shuffle(alls)
	#v = random.sample(alls, 300)
	#for i in v:
	# alls.remove(i)
	#t = random.sample(alls, 300)
	#for i in t:
	# alls.remove(i)
	with open("randomdata_concat.tsv","a")as f:
	f.write("\n".join())
	#with open("dev_fix.tsv","a")as f:
	# for i in v:
	# f.write("\n".join(i))
	# f.write("\n\n")
	#with open("test_fix.tsv","a")as f:
	# for i in t:
	# f.write("\n".join(i))
	# f.write("\n\n")
	"""

	"""
	out = ""
	for line in open("/clwork/ando/SEGBOT_BERT/alldata2_bert.tsv"):
	line = line.split("\t")
	line = line[0].strip()
	if line == "" or "サマリ" in line:
	continue
	out += line + "\n"
	with open("alldata3.tsv","w")as f:
	f.write(out)
	"""
	"""
	#wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
	wakati = MeCab.Tagger("-Owakati -b 81920")

	with open('fm_space.pickle', 'rb') as f:
	fm = pickle.load(f)
	#model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
	#model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
	texts = []
	sent = ""
	sparate = []
	label = []
	ruiseki = 0
	ruiseki2 = 0
	alls = []
	for n, line in enumerate(open("/clwork/ando/SEGBOT/train_fix.tsv")):
	line = line.strip("\t").rstrip("\n")
	#print(line)
	if line == "":
	if sent == "":
	continue
	sent = wakati.parse(sent).split(" ")[:-1]
	flag = 0
	for i in sent:
	for j in sparate:
	if ruiseki+len(i) > j and ruiseki < j:
	label.append(1)
	flag = 1
	elif ruiseki+len(i) == j:
	label.append(1)
	flag = 1
	if flag == 0:
	label.append(0)
	flag = 0
	ruiseki += len(i)
	#texts += i + " "
	try:
	#texts.append(model[i])
	texts.append(fm.vocab[i])
	#texts += str(fm.vocab[i].index) + " "
	#print(i,str(fm.vocab[i].index))
	except KeyError:
	texts.append(fm.vocab["<unk>"])
	print(i)
	label[-1] = 1
	#texts = texts.rstrip() + "\t"
	#texts += " ".join(label) + "\n"
	alls.append((str(n),texts,label))
	sent = ""
	sparate = []
	texts = []
	label = []
	ruiseki = 0
	ruiseki2 = 0
	continue
	sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
	ruiseki2 += len(line)
	sparate.append(ruiseki2)
	with open('nm_ni/train.pickle', 'wb') as f:
	pickle.dump(alls, f)
	#print(alls)
	#with open("resepdata_seped.tsv","w")as f:
	# f.write(texts)
	"""



	# Module-level script (runs on import): reads "alldata.tsv", groups lines
	# into records, tokenises each record with MeCab and writes one output row
	# per record to "random_labbeled.tsv".
	#
	# Only the first tab-separated column of each input line is used.  A line
	# whose first column is empty or contains "サマリ" closes the current
	# record and is itself discarded.  An output row is
	#     "0 0 ... 0" <TAB> "l1 l2 ... ln" <NEWLINE>
	# with one placeholder "0" and one label per token.  A label is "1" when
	# the end of one of the record's source lines falls inside the token or
	# exactly at its end, else "0"; the last label of a row is always "1".
	# A record that is not followed by a closing line is never written.
	wakati = MeCab.Tagger("-Owakati")

	texts = ""      # accumulated output rows
	sent = ""       # concatenated text of the open record
	sparate = []    # offsets in ``sent`` at which each source line ends
	label = []
	ruiseki = 0     # running character offset over the tokens of a record
	ruiseki2 = 0    # running character offset over the lines of a record
	with open("alldata.tsv") as f:
	    for line in f:
	        line = line.split("\t")[0].strip()

	        if line != "" and "サマリ" not in line:
	            sent += line
	            ruiseki2 += len(line)
	            sparate.append(ruiseki2)
	            continue

	        if sent == "":
	            continue

	        sent = wakati.parse(sent).split(" ")[:-1]
	        for i in sent:
	            # Exactly one label per token, even when several line ends
	            # fall inside the same token.
	            flag = 0
	            for j in sparate:
	                if ruiseki < j < ruiseki + len(i) or ruiseki + len(i) == j:
	                    flag = 1
	                    break
	            label.append(str(flag))
	            ruiseki += len(i)

	        if label:
	            # Skip records the tokeniser returned nothing for; there is no
	            # last token to mark.
	            label[-1] = "1"
	            texts += " ".join(["0"] * len(sent)) + "\t" + " ".join(label) + "\n"

	        sent = ""
	        sparate = []
	        label = []
	        ruiseki = 0
	        ruiseki2 = 0

	with open("random_labbeled.tsv", "w") as f:
	    f.write(texts)





	"""
	wakati = MeCab.Tagger("-Owakati -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")


	#fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300_space.vec', binary=False)
	#with open('fm_space.pickle', 'wb') as f:
	# pickle.dump(fm, f)

	with open('fm_space.pickle', 'rb') as f:
	fm = pickle.load(f)
	texts = ""
	sent = ""
	sparate = []
	label = []
	ruiseki = 0
	ruiseki2 = 0
	for line in open("/clwork/ando/SEGBOT/alldata_resep.tsv"):
	line = line.split("\t")
	line = line[0].strip("\t").rstrip("\n")
	#print(line)
	if line == "" or "サマリ" in line:
	if sent == "":
	continue
	print(sent)
	sent = sent.replace(" ","<space>")
	sent = wakati.parse(sent).split(" ")[:-1]
	print(sent)
	flag = 0
	#print(sent,sparate)
	for i in sent:
	#print(i)
	for j in sparate:
	if ruiseki+len(i) > j and ruiseki < j:
	#print(j)
	label.append("1")
	flag = 1
	elif ruiseki+len(i) == j:
	#print(j)
	label.append("1")
	flag = 1
	if flag == 0:
	label.append("0")
	flag = 0
	ruiseki += len(i)
	#texts += i + " "

	try:
	texts += str(fm.vocab[i].index) + " "
	#print(i,str(fm.vocab[i].index))
	except KeyError:
	texts += str(fm.vocab["<unk>"].index) + " "
	label[-1] = "1"
	texts = texts.rstrip() + "\t"
	texts += " ".join(label) + "\n"
	sent = ""
	sparate = []
	label = []
	ruiseki = 0
	ruiseki2 = 0
	#print(texts)
	continue
	sent += line.strip("\t")
	ruiseki2 += len(line)
	sparate.append(ruiseki2)
	with open("alldata2_space.tsv","w")as f:
	f.write(texts)
	"""



	"""
	wakati = MeCab.Tagger("-Owakati")

	fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
	texts = ""
	sent = ""
	cand = ""
	sparate = []
	label = []
	ruiseki = 0
	ruiseki2 = 0
	flag2 = 1
	for line in open("data2.tsv"):
	line = line.split("\t")
	if flag2 == 1:
	cand = line
	flag2 = 2
	continue
	if flag2 == 2:
	flag2 = 1
	#print(line,cand)
	for n,z in enumerate(zip(cand,line)):
	i = z[0]
	j = z[1]
	n = n+1
	if i == "":
	sent = wakati.parse(sent).split(" ")[:-1]
	flag = 0
	#print(sent,sparate)
	for i in sent:
	#print(i)
	for j in sparate:
	if ruiseki+len(i) > j and ruiseki < j:
	#print(j)
	label.append("1")
	flag = 1
	elif ruiseki+len(i) == j:
	#print(j)
	label.append("1")
	flag = 1
	if flag == 0:
	label.append("0")
	flag = 0
	ruiseki += len(i)
	#texts += i + " "

	try:
	texts += str(fm.vocab[i].index) + " "
	except KeyError:
	texts += str(fm.vocab["<unk>"].index) + " "

	label[-1] = "1"
	texts = texts.rstrip() + "\t"
	texts += " ".join(label) + "\n"
	sent = ""
	sparate = []
	label = []
	ruiseki = 0
	ruiseki2 = 0
	#print(texts)
	break
	if j == "\|":
	sparate.append(n)
	sent += i
	with open("alldata.tsv","w")as f:
	f.write(texts)
	"""