# Scraped page header (not part of the program): kenichiro / commit 46a030d / raw / history blame / 20.4 kB
import gensim
import MeCab
import pickle
from gensim.models.wrappers.fasttext import FastText
#import fasttext as ft
import random
import mojimoji
import numpy as np
from tqdm import tqdm
def ymyi(lis):
    """Turn a segment-per-line corpus into fastText vectors and boundary labels.

    The file ``lis`` is read line by line.  Consecutive non-empty lines are
    converted with ``mojimoji.han_to_zen`` and concatenated into one sentence;
    an empty line closes the sentence.  The sentence is tokenised with MeCab
    (wakati mode, MANBYO user dictionary) and each token gets label ``1`` when
    the end of one of the original lines falls inside the token or exactly at
    its end, otherwise ``0``.  The last token of a sentence is always ``1``.

    A sentence that is not followed by an empty line before end-of-file is
    not emitted (unchanged from the original behaviour).

    Args:
        lis: Path of the input text file.

    Returns:
        ``(num, text, labels)`` -- three parallel lists with one entry per
        emitted sentence: the 0-based index of the empty line that closed the
        sentence, the list of token vectors, and the list of int labels.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)
    # NOTE(review): the original called ``ft.load_model`` but ``import fasttext
    # as ft`` is commented out at the top of the file, so ``ft`` was undefined.
    # The gensim wrapper imported at module level is used instead, and it
    # raises KeyError for tokens it cannot represent, which is what the handler
    # below expects.  Confirm this is the intended backend.
    model = FastText.load_fasttext_format("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")

    num, text, labels = [], [], []
    sent = ""
    boundaries = []  # cumulative end offset of every line in the sentence
    offset = 0
    with open(lis) as src:
        for n, line in enumerate(src):
            line = line.strip("\t").rstrip("\n")
            if line != "":
                converted = mojimoji.han_to_zen(line, digit=False, ascii=False)
                sent += converted
                # Offsets are measured on the converted text because that is
                # the text MeCab tokenises; han_to_zen can change the length.
                offset += len(converted)
                boundaries.append(offset)
                continue
            if sent == "":
                continue

            tokens = wakati.parse(sent).split(" ")[:-1]
            vectors, label = [], []
            start = 0
            for token in tokens:
                end = start + len(token)
                # Exactly one label per token, even if several line ends fall
                # inside it; otherwise labels and vectors lose alignment.
                is_boundary = any(start < b < end or b == end for b in boundaries)
                label.append(1 if is_boundary else 0)
                start = end
                try:
                    vectors.append(model[token])
                except KeyError:
                    vectors.append(fm["<unk>"])

            if label:  # skip sentences for which MeCab produced no token
                label[-1] = 1
                labels.append(label)
                text.append(vectors)
                num.append(n)
            sent = ""
            boundaries = []
            offset = 0
    return num, text, labels
def nmni(lis):
    """Turn a segment-per-line corpus into embedding vectors and boundary labels.

    The file ``lis`` is read line by line.  Consecutive non-empty lines are
    converted with ``mojimoji.han_to_zen`` and concatenated into one sentence;
    an empty line closes the sentence.  The sentence is tokenised with MeCab
    (wakati mode) and each token gets label ``1`` when the end of one of the
    original lines falls inside the token or exactly at its end, otherwise
    ``0``.  The last token of a sentence is always ``1``.  Token vectors are
    looked up in the object unpickled from ``fm_space.pickle``; tokens that
    raise ``KeyError`` fall back to the ``"<unk>"`` entry.

    A sentence that is not followed by an empty line before end-of-file is
    not emitted (unchanged from the original behaviour).

    Args:
        lis: Path of the input text file.

    Returns:
        ``(num, text, labels)`` -- three parallel lists with one entry per
        emitted sentence: the 0-based index of the empty line that closed the
        sentence, the list of token vectors, and the list of int labels.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)

    num, text, labels = [], [], []
    sent = ""
    boundaries = []  # cumulative end offset of every line in the sentence
    offset = 0
    with open(lis) as src:
        for n, line in enumerate(src):
            line = line.strip("\t").rstrip("\n")
            if line != "":
                converted = mojimoji.han_to_zen(line, digit=False, ascii=False)
                sent += converted
                # Offsets are measured on the converted text because that is
                # the text MeCab tokenises; han_to_zen can change the length.
                offset += len(converted)
                boundaries.append(offset)
                continue
            if sent == "":
                continue

            tokens = wakati.parse(sent).split(" ")[:-1]
            vectors, label = [], []
            start = 0
            for token in tokens:
                end = start + len(token)
                # Exactly one label per token, even if several line ends fall
                # inside it; otherwise labels and vectors lose alignment.
                is_boundary = any(start < b < end or b == end for b in boundaries)
                label.append(1 if is_boundary else 0)
                start = end
                try:
                    vectors.append(fm[token])
                except KeyError:
                    vectors.append(fm["<unk>"])

            if label:  # skip sentences for which MeCab produced no token
                label[-1] = 1
                labels.append(label)
                text.append(vectors)
                num.append(n)
            sent = ""
            boundaries = []
            offset = 0
    return num, text, labels
def nmni_finetune(lis):
    """Turn a segment-per-line corpus into vocabulary indices and boundary labels.

    The file ``lis`` is read line by line.  Consecutive non-empty lines are
    converted with ``mojimoji.han_to_zen`` and concatenated into one sentence;
    an empty line closes the sentence.  The sentence is tokenised with MeCab
    (wakati mode) and each token gets label ``1`` when the end of one of the
    original lines falls inside the token or exactly at its end, otherwise
    ``0``.  The last token of a sentence is always ``1``.  Each token is mapped
    to ``fm.vocab[token].index`` where ``fm`` is unpickled from ``fm.pickle``;
    tokens that raise ``KeyError`` map to the index of ``"<unk>"``.

    A sentence that is not followed by an empty line before end-of-file is
    not emitted (unchanged from the original behaviour).

    Args:
        lis: Path of the input text file.

    Returns:
        ``(text, labels)`` -- two parallel lists of ``numpy`` arrays, one pair
        per emitted sentence: token indices and int labels.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)
    unk_index = fm.vocab["<unk>"].index

    text, labels = [], []
    sent = ""
    boundaries = []  # cumulative end offset of every line in the sentence
    offset = 0
    with open(lis) as src:
        for line in src:
            line = line.strip("\t").rstrip("\n")
            if line != "":
                converted = mojimoji.han_to_zen(line, digit=False, ascii=False)
                sent += converted
                # Offsets are measured on the converted text because that is
                # the text MeCab tokenises; han_to_zen can change the length.
                offset += len(converted)
                boundaries.append(offset)
                continue
            if sent == "":
                continue

            tokens = wakati.parse(sent).split(" ")[:-1]
            indices, label = [], []
            start = 0
            for token in tokens:
                end = start + len(token)
                # Exactly one label per token, even if several line ends fall
                # inside it; otherwise labels and indices lose alignment.
                is_boundary = any(start < b < end or b == end for b in boundaries)
                label.append(1 if is_boundary else 0)
                start = end
                try:
                    indices.append(fm.vocab[token].index)
                except KeyError:
                    indices.append(unk_index)

            if label:  # skip sentences for which MeCab produced no token
                label[-1] = 1
                labels.append(np.array(label))
                text.append(np.array(indices))
            sent = ""
            boundaries = []
            offset = 0
    return text, labels
def nmni_carte(lis, n_files=108):
    """Index every line of a numbered series of text files.

    Files ``lis + "0.txt"``, ``lis + "1.txt"``, ... are read in order.  Every
    non-blank line (after ``str.strip``) is tokenised with MeCab (wakati mode)
    and each token is mapped to ``fm.vocab[token].index`` where ``fm`` is
    unpickled from ``fm.pickle``; tokens that raise ``KeyError`` map to the
    index of ``"<unk>"``.  Labels are ``0`` for every token except the last
    one of the line, which is ``1``.

    Args:
        lis: Path prefix; the file number and ``".txt"`` are appended to it.
        n_files: Number of files to read, starting at 0.  Defaults to 108,
            the value that was previously hard-coded.

    Returns:
        ``(altex, allab, fukugenss)`` -- three parallel lists with one entry
        per file; each entry is a list with one item per processed line:
        a ``numpy`` array of token indices, a ``numpy`` array of labels, and
        the list of token strings.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)
    unk_index = fm.vocab["<unk>"].index

    altex, allab, fukugenss = [], [], []
    for n in tqdm(range(n_files)):
        file_text, file_labels, file_tokens = [], [], []
        with open(lis + str(n) + ".txt") as src:
            for line in src:
                line = line.strip()
                if line == "":
                    continue
                tokens = wakati.parse(line).split(" ")[:-1]
                if not tokens:
                    # Nothing to label; the original raised IndexError here.
                    continue
                indices = []
                for token in tokens:
                    try:
                        indices.append(fm.vocab[token].index)
                    except KeyError:
                        indices.append(unk_index)
                label = [0] * len(tokens)
                label[-1] = 1
                file_labels.append(np.array(label))
                file_text.append(np.array(indices))
                file_tokens.append(list(tokens))
        allab.append(file_labels)
        altex.append(file_text)
        fukugenss.append(file_tokens)
    return altex, allab, fukugenss
def nmni_finetune_s(lis):
    """Index every line of a text file, one training example per line.

    Side effect: loads ``cc.ja.300.vec`` with gensim and (re)writes it to
    ``fm.pickle`` before processing the input.

    Each line of ``lis`` is tokenised with MeCab (wakati mode) and each token
    is mapped to ``fm.vocab[token].index``; tokens that raise ``KeyError`` map
    to the index of ``"<unk>"``.  Labels are ``0`` for every token except the
    last one of the line, which is ``1``.  Lines that produce no token (for
    example empty lines) are skipped; the original raised ``IndexError`` on
    them.

    Args:
        lis: Path of the input text file.

    Returns:
        ``(text, labels)`` -- two parallel lists of ``numpy`` arrays, one pair
        per processed line: token indices and int labels.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)
    unk_index = fm.vocab["<unk>"].index

    text, labels = [], []
    with open(lis) as src:
        for line in src:
            line = line.strip("\t").rstrip("\n")
            tokens = wakati.parse(line).split(" ")[:-1]
            if not tokens:
                continue
            indices = []
            for token in tokens:
                try:
                    indices.append(fm.vocab[token].index)
                except KeyError:
                    indices.append(unk_index)
            label = [0] * len(tokens)
            label[-1] = 1
            labels.append(np.array(label))
            text.append(np.array(indices))
    return text, labels
def nmni_finetune_ss(lis, n_files=108):
    """Index every line of a numbered series of text files.

    Side effect: loads ``cc.ja.300.vec`` with gensim and (re)writes it to
    ``fm.pickle`` before processing the input.

    Files ``lis + "0.txt"``, ``lis + "1.txt"``, ... are read in order.  Every
    non-empty line is tokenised with MeCab (wakati mode) and each token is
    mapped to ``fm.vocab[token].index``; tokens that raise ``KeyError`` map to
    the index of ``"<unk>"``.  Labels are ``0`` for every token except the
    last one of the line, which is ``1``.  Lines that produce no token are
    skipped; the original raised ``IndexError`` on them.

    Args:
        lis: Path prefix; the file number and ``".txt"`` are appended to it.
        n_files: Number of files to read, starting at 0.  Defaults to 108,
            the value that was previously hard-coded.

    Returns:
        ``(t, l)`` -- two parallel lists with one entry per file; each entry
        is a list of ``numpy`` arrays (token indices in ``t``, labels in
        ``l``), one per processed line.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)
    unk_index = fm.vocab["<unk>"].index

    t, l = [], []
    # The original reused ``i`` for both the file number and the token; the
    # two loops now use distinct names.
    for file_no in range(n_files):
        file_text, file_labels = [], []
        with open(lis + str(file_no) + ".txt") as src:
            for line in src:
                line = line.strip("\t").rstrip("\n")
                if line == "":
                    continue
                tokens = wakati.parse(line).split(" ")[:-1]
                if not tokens:
                    continue
                indices = []
                for token in tokens:
                    try:
                        indices.append(fm.vocab[token].index)
                    except KeyError:
                        indices.append(unk_index)
                label = [0] * len(tokens)
                label[-1] = 1
                file_labels.append(np.array(label))
                file_text.append(np.array(indices))
        t.append(file_text)
        l.append(file_labels)
    return t, l
#model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
#print(model.get_subwords("間質性肺炎"))
#print(model.get_subwords("誤嚥性肺炎"))
#print(model.get_subwords("談話ユニット分割"))
"""
texts = []
sent = ""
sparate = []
label = []
ruiseki = 0
ruiseki2 = 0
alls = []
for n, line in enumerate(open("/clwork/ando/SEGBOT/randomdata.tsv")):
line = line.strip("\t").rstrip("\n")
if line == "":
if sent == "":
continue
alls.append(sent)
sent = ""
continue
else:
sent += line
if len(sent) != 0:
alls.append(sent)
random.shuffle(alls)
#v = random.sample(alls, 300)
#for i in v:
# alls.remove(i)
#t = random.sample(alls, 300)
#for i in t:
# alls.remove(i)
with open("randomdata_concat.tsv","a")as f:
f.write("\n".join())
#with open("dev_fix.tsv","a")as f:
# for i in v:
# f.write("\n".join(i))
# f.write("\n\n")
#with open("test_fix.tsv","a")as f:
# for i in t:
# f.write("\n".join(i))
# f.write("\n\n")
"""
"""
out = ""
for line in open("/clwork/ando/SEGBOT_BERT/alldata2_bert.tsv"):
line = line.split("\t")
line = line[0].strip()
if line == "" or "サマリ" in line:
continue
out += line + "\n"
with open("alldata3.tsv","w")as f:
f.write(out)
"""
"""
#wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
wakati = MeCab.Tagger("-Owakati -b 81920")
with open('fm_space.pickle', 'rb') as f:
fm = pickle.load(f)
#model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
#model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
texts = []
sent = ""
sparate = []
label = []
ruiseki = 0
ruiseki2 = 0
alls = []
for n, line in enumerate(open("/clwork/ando/SEGBOT/train_fix.tsv")):
line = line.strip("\t").rstrip("\n")
#print(line)
if line == "":
if sent == "":
continue
sent = wakati.parse(sent).split(" ")[:-1]
flag = 0
for i in sent:
for j in sparate:
if ruiseki+len(i) > j and ruiseki < j:
label.append(1)
flag = 1
elif ruiseki+len(i) == j:
label.append(1)
flag = 1
if flag == 0:
label.append(0)
flag = 0
ruiseki += len(i)
#texts += i + " "
try:
#texts.append(model[i])
texts.append(fm.vocab[i])
#texts += str(fm.vocab[i].index) + " "
#print(i,str(fm.vocab[i].index))
except KeyError:
texts.append(fm.vocab["<unk>"])
print(i)
label[-1] = 1
#texts = texts.rstrip() + "\t"
#texts += " ".join(label) + "\n"
alls.append((str(n),texts,label))
sent = ""
sparate = []
texts = []
label = []
ruiseki = 0
ruiseki2 = 0
continue
sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
ruiseki2 += len(line)
sparate.append(ruiseki2)
with open('nm_ni/train.pickle', 'wb') as f:
pickle.dump(alls, f)
#print(alls)
#with open("resepdata_seped.tsv","w")as f:
# f.write(texts)
"""
# Module-level script: builds "random_labbeled.tsv" from "alldata.tsv".
# NOTE(review): this runs whenever the module is imported; consider moving it
# under ``if __name__ == "__main__":`` once no caller relies on that.
#
# Input: the first tab-separated field of every line is used.  Non-empty
# fields are concatenated into a sentence; an empty field, or one containing
# "サマリ", closes the sentence (and is itself discarded).
# Output: one row per sentence -- a "0" placeholder per token, a tab, then one
# label per token ("1" when the end of an input line falls inside the token or
# at its end, and always for the last token; otherwise "0").
wakati = MeCab.Tagger("-Owakati")
#fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
#with open('fm.pickle', 'wb') as f:
#    pickle.dump(fm, f)
rows = []
sent = ""
sparate = []  # cumulative end offset of every input line in the sentence
ruiseki2 = 0
with open("alldata.tsv") as src:
    for line in src:
        line = line.split("\t")[0].strip()
        if line == "" or "サマリ" in line:
            if sent == "":
                continue
            tokens = wakati.parse(sent).split(" ")[:-1]
            label = []
            ruiseki = 0
            for token in tokens:
                end = ruiseki + len(token)
                # Exactly one label per token, even if several line ends fall
                # inside it; otherwise the two columns lose alignment.
                hit = any(ruiseki < j < end or j == end for j in sparate)
                label.append("1" if hit else "0")
                ruiseki = end
            if label:  # skip sentences for which MeCab produced no token
                label[-1] = "1"
                rows.append(" ".join("0" for _ in tokens) + "\t" + " ".join(label) + "\n")
            sent = ""
            sparate = []
            ruiseki2 = 0
            continue
        sent += line
        ruiseki2 += len(line)
        sparate.append(ruiseki2)
texts = "".join(rows)
with open("random_labbeled.tsv","w")as f:
    f.write(texts)
"""
wakati = MeCab.Tagger("-Owakati -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
#fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300_space.vec', binary=False)
#with open('fm_space.pickle', 'wb') as f:
# pickle.dump(fm, f)
with open('fm_space.pickle', 'rb') as f:
fm = pickle.load(f)
texts = ""
sent = ""
sparate = []
label = []
ruiseki = 0
ruiseki2 = 0
for line in open("/clwork/ando/SEGBOT/alldata_resep.tsv"):
line = line.split("\t")
line = line[0].strip("\t").rstrip("\n")
#print(line)
if line == "" or "サマリ" in line:
if sent == "":
continue
print(sent)
sent = sent.replace(" ","<space>")
sent = wakati.parse(sent).split(" ")[:-1]
print(sent)
flag = 0
#print(sent,sparate)
for i in sent:
#print(i)
for j in sparate:
if ruiseki+len(i) > j and ruiseki < j:
#print(j)
label.append("1")
flag = 1
elif ruiseki+len(i) == j:
#print(j)
label.append("1")
flag = 1
if flag == 0:
label.append("0")
flag = 0
ruiseki += len(i)
#texts += i + " "
try:
texts += str(fm.vocab[i].index) + " "
#print(i,str(fm.vocab[i].index))
except KeyError:
texts += str(fm.vocab["<unk>"].index) + " "
label[-1] = "1"
texts = texts.rstrip() + "\t"
texts += " ".join(label) + "\n"
sent = ""
sparate = []
label = []
ruiseki = 0
ruiseki2 = 0
#print(texts)
continue
sent += line.strip("\t")
ruiseki2 += len(line)
sparate.append(ruiseki2)
with open("alldata2_space.tsv","w")as f:
f.write(texts)
"""
"""
wakati = MeCab.Tagger("-Owakati")
fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
texts = ""
sent = ""
cand = ""
sparate = []
label = []
ruiseki = 0
ruiseki2 = 0
flag2 = 1
for line in open("data2.tsv"):
line = line.split("\t")
if flag2 == 1:
cand = line
flag2 = 2
continue
if flag2 == 2:
flag2 = 1
#print(line,cand)
for n,z in enumerate(zip(cand,line)):
i = z[0]
j = z[1]
n = n+1
if i == "":
sent = wakati.parse(sent).split(" ")[:-1]
flag = 0
#print(sent,sparate)
for i in sent:
#print(i)
for j in sparate:
if ruiseki+len(i) > j and ruiseki < j:
#print(j)
label.append("1")
flag = 1
elif ruiseki+len(i) == j:
#print(j)
label.append("1")
flag = 1
if flag == 0:
label.append("0")
flag = 0
ruiseki += len(i)
#texts += i + " "
try:
texts += str(fm.vocab[i].index) + " "
except KeyError:
texts += str(fm.vocab["<unk>"].index) + " "
label[-1] = "1"
texts = texts.rstrip() + "\t"
texts += " ".join(label) + "\n"
sent = ""
sparate = []
label = []
ruiseki = 0
ruiseki2 = 0
#print(texts)
break
if j == "|":
sparate.append(n)
sent += i
with open("alldata.tsv","w")as f:
f.write(texts)
"""