import gensim
import MeCab
import pickle
from gensim.models.wrappers.fasttext import FastText  # unused below; only available in gensim < 4.0
import fasttext as ft  # needed by ymyi(), which calls ft.load_model()
import random
import mojimoji
import numpy as np
from tqdm import tqdm
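# --- data loaders ------------------------------------------------------------
# The loaders below share one pattern: blocks of a blank-line-separated input
# file are concatenated, tokenized with MeCab, and given 0/1 labels in which 1
# marks a token that reaches or crosses one of the original line boundaries.
# The /clwork/ando/SEGBOT/... paths suggest the output feeds a SegBot-style
# segmentation model (an inference from the paths, not stated in the code).
#
# ymyi(path): embeds each token with the fastText binary model loaded via
# ft.load_model(); tokens that fail lookup fall back to the <unk> vector from
# fm_space.pickle and are forced to label 1. Returns (block ids, vectors, labels).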
def ymyi(lis):
    wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)
    #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
    model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
    texts = []
    sent = ""
    sparate = []
    label = []
    ruiseki = 0
    ruiseki2 = 0
    alls = []
    labels, text, num = [], [], []
    for n, line in enumerate(open(lis)):
        line = line.strip("\t").rstrip("\n")
        #print(line)
        if line == "":
            if sent == "":
                continue
            sent = wakati.parse(sent).split(" ")[:-1]
            flag = 0
            for i in sent:
                for j in sparate:
                    if ruiseki+len(i) > j and ruiseki < j:
                        label.append(1)
                        flag = 1
                    elif ruiseki+len(i) == j:
                        label.append(1)
                        flag = 1
                if flag == 0:
                    label.append(0)
                flag = 0
                ruiseki += len(i)
                #texts += i + " "
                try:
                    texts.append(model[i])
                    #texts.append(np.array(fm.vocab[i]))
                    #texts += str(fm.vocab[i].index) + " "
                    #print(i,str(fm.vocab[i].index))
                except KeyError:
                    texts.append(fm["<unk>"])
                    label[-1] = 1
            #texts = texts.rstrip() + "\t"
            #texts += " ".join(label) + "\n"
            #alls.append((n,texts,label))
            labels.append(label)
            text.append(texts)
            num.append(n)
            sent = ""
            sparate = []
            texts = []
            label = []
            ruiseki = 0
            ruiseki2 = 0
            continue
        sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
        ruiseki2 += len(line)
        sparate.append(ruiseki2)
    return num, text, labels
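# nmni(path): same labelling scheme as ymyi(), but tokenizes with the default
# MeCab dictionary and looks vectors up in the pickled KeyedVectors
# (fm_space.pickle) instead of the fastText binary model.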
def nmni(lis):
    #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
    wakati = MeCab.Tagger("-Owakati -b 81920")
    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)
    #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
    #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
    texts = []
    sent = ""
    sparate = []
    label = []
    ruiseki = 0
    ruiseki2 = 0
    alls = []
    labels, text, num = [], [], []
    for n, line in enumerate(open(lis)):
        line = line.strip("\t").rstrip("\n")
        #print(line)
        if line == "":
            if sent == "":
                continue
            sent = wakati.parse(sent).split(" ")[:-1]
            flag = 0
            for i in sent:
                for j in sparate:
                    if ruiseki+len(i) > j and ruiseki < j:
                        label.append(1)
                        flag = 1
                    elif ruiseki+len(i) == j:
                        label.append(1)
                        flag = 1
                if flag == 0:
                    label.append(0)
                flag = 0
                ruiseki += len(i)
                #texts += i + " "
                try:
                    #texts.append(model[i])
                    texts.append(fm[i])
                    #texts += str(fm.vocab[i].index) + " "
                    #print(i,str(fm.vocab[i].index))
                except KeyError:
                    texts.append(fm["<unk>"])
                    label[-1] = 1
            #texts = texts.rstrip() + "\t"
            #texts += " ".join(label) + "\n"
            #alls.append((n,texts,label))
            labels.append(label)
            text.append(texts)
            num.append(n)
            sent = ""
            sparate = []
            texts = []
            label = []
            ruiseki = 0
            ruiseki2 = 0
            continue
        sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
        ruiseki2 += len(line)
        sparate.append(ruiseki2)
    return num, text, labels
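# nmni_finetune(path): same labelling scheme, but stores vocabulary indices
# (fm.vocab[token].index from fm.pickle) as numpy arrays instead of vectors,
# presumably so the embedding layer can be fine-tuned downstream (inferred
# from the function name). Returns (text, labels) without the block ids.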
def nmni_finetune(lis):
    #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
    wakati = MeCab.Tagger("-Owakati -b 81920")
    #fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)
    #fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    #with open('fm.pickle', 'wb') as f:
    #    pickle.dump(fm, f)
    #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
    #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
    texts = []
    sent = ""
    sparate = []
    label = []
    ruiseki = 0
    ruiseki2 = 0
    alls = []
    labels, text, num = [], [], []
    for n, line in enumerate(open(lis)):
        line = line.strip("\t").rstrip("\n")
        #print(line)
        if line == "":
            if sent == "":
                continue
            sent = wakati.parse(sent).split(" ")[:-1]
            flag = 0
            for i in sent:
                for j in sparate:
                    if ruiseki+len(i) > j and ruiseki < j:
                        label.append(1)
                        flag = 1
                    elif ruiseki+len(i) == j:
                        label.append(1)
                        flag = 1
                if flag == 0:
                    label.append(0)
                flag = 0
                ruiseki += len(i)
                #texts += i + " "
                try:
                    #texts.append(model[i])
                    #texts.append(fm[i])
                    texts.append(fm.vocab[i].index)
                    #print(i,str(fm.vocab[i].index))
                except KeyError:
                    texts.append(fm.vocab["<unk>"].index)
                    label[-1] = 1
            #texts = texts.rstrip() + "\t"
            #texts += " ".join(label) + "\n"
            #alls.append((n,texts,label))
            labels.append(np.array(label))
            text.append(np.array(texts))
            num.append(n)
            sent = ""
            sparate = []
            texts = []
            label = []
            ruiseki = 0
            ruiseki2 = 0
            continue
        sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
        ruiseki2 += len(line)
        sparate.append(ruiseki2)
    return text, labels
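# nmni_carte(prefix): reads the numbered files prefix + "0.txt" ... "107.txt"
# (the name suggests one カルテ / clinical record per file), converts each line
# to vocabulary indices, labels the final token of every line 1, and also keeps
# the surface tokens ("fukugen" = restoration) so segments can be mapped back
# to the original text.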
def nmni_carte(lis):
    #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
    wakati = MeCab.Tagger("-Owakati -b 81920")
    #fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
    #fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    #with open('fm.pickle', 'wb') as f:
    #    pickle.dump(fm, f)
    #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
    #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)
    texts = []
    sent = ""
    sparate = []
    label = []
    ruiseki = 0
    ruiseki2 = 0
    alls = []
    labels, text, num = [], [], []
    allab, altex, fukugenss = [], [], []
    #for n in tqdm(range(26431)):
    for n in tqdm(range(108)):
        fukugens = []
        for line in open(lis+str(n)+".txt"):
            line = line.strip()
            if line == "":
                continue
            sent = wakati.parse(line).split(" ")[:-1]
            flag = 0
            label = []
            texts = []
            fukugen = []
            for i in sent:
                try:
                    texts.append(fm.vocab[i].index)
                except KeyError:
                    texts.append(fm.vocab["<unk>"].index)
                fukugen.append(i)
                label.append(0)
            label[-1] = 1
            labels.append(np.array(label))
            text.append(np.array(texts))
            #labels.append(label)
            #text.append(texts)
            fukugens.append(fukugen)
        allab.append(labels)
        altex.append(text)
        fukugenss.append(fukugens)
        labels, text, fukugens = [], [], []
    return altex, allab, fukugenss
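# nmni_finetune_s(path): line-level variant; every input line becomes one
# sample whose final token is labelled 1. It also rebuilds fm.pickle from
# cc.ja.300.vec as a side effect, so the first call is slow.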
def nmni_finetune_s(lis):
    #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
    wakati = MeCab.Tagger("-Owakati -b 81920")
    #fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)
    #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
    #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
    texts = []
    sent = ""
    sparate = []
    label = []
    ruiseki = 0
    ruiseki2 = 0
    alls = []
    labels, text, num = [], [], []
    for n, line in enumerate(open(lis)):
        line = line.strip("\t").rstrip("\n")
        sent = wakati.parse(line).split(" ")[:-1]
        flag = 0
        label = []
        texts = []
        for i in sent:
            try:
                texts.append(fm.vocab[i].index)
            except KeyError:
                texts.append(fm.vocab["<unk>"].index)
            label.append(0)
        label[-1] = 1
        labels.append(np.array(label))
        text.append(np.array(texts))
    return text, labels
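# nmni_finetune_ss(prefix): like nmni_finetune_s() but iterates over the 108
# numbered files and groups samples per file. Note that the innermost
# `for i in sent` reuses the outer file index `i`; this happens to be harmless
# because the filename is evaluated before the inner loop runs, but it is an
# easy place to introduce a bug when editing.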
def nmni_finetune_ss(lis):
    #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
    wakati = MeCab.Tagger("-Owakati -b 81920")
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)
    #with open('fm.pickle', 'rb') as f:
    #    fm = pickle.load(f)
    #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
    #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
    t, l = [], []
    for i in range(108):
        texts = []
        sent = ""
        sparate = []
        label = []
        ruiseki = 0
        ruiseki2 = 0
        alls = []
        labels, text, num = [], [], []
        for n, line in enumerate(open(lis+str(i)+".txt")):
            line = line.strip("\t").rstrip("\n")
            if line == "":
                continue
            sent = wakati.parse(line).split(" ")[:-1]
            flag = 0
            label = []
            texts = []
            for i in sent:
                try:
                    texts.append(fm.vocab[i].index)
                except KeyError:
                    texts.append(fm.vocab["<unk>"].index)
                label.append(0)
            label[-1] = 1
            labels.append(np.array(label))
            text.append(np.array(texts))
        t.append(text)
        l.append(labels)
    return t, l
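# --- usage sketch ------------------------------------------------------------
# Not part of the original script; the paths are placeholders for whatever
# data this project actually uses:
#
#   num, text, labels = nmni("/clwork/ando/SEGBOT/train_fix.tsv")
#   text, labels = nmni_finetune("/clwork/ando/SEGBOT/train_fix.tsv")
#   altex, allab, surfaces = nmni_carte("carte_")  # expects carte_0.txt ... carte_107.txt
#
# The rest of the file is one-off preprocessing code: only the
# alldata.tsv -> random_labbeled.tsv section further down is still active;
# the other blocks are disabled inside triple-quoted strings.
# -----------------------------------------------------------------------------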
# subword checks: 間質性肺炎 (interstitial pneumonia), 誤嚥性肺炎 (aspiration pneumonia),
# 談話ユニット分割 (discourse unit segmentation)
#model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
#print(model.get_subwords("間質性肺炎"))
#print(model.get_subwords("誤嚥性肺炎"))
#print(model.get_subwords("談話ユニット分割"))
""" | |
texts = [] | |
sent = "" | |
sparate = [] | |
label = [] | |
ruiseki = 0 | |
ruiseki2 = 0 | |
alls = [] | |
for n, line in enumerate(open("/clwork/ando/SEGBOT/randomdata.tsv")): | |
line = line.strip("\t").rstrip("\n") | |
if line == "": | |
if sent == "": | |
continue | |
alls.append(sent) | |
sent = "" | |
continue | |
else: | |
sent += line | |
if len(sent) != 0: | |
alls.append(sent) | |
random.shuffle(alls) | |
#v = random.sample(alls, 300) | |
#for i in v: | |
# alls.remove(i) | |
#t = random.sample(alls, 300) | |
#for i in t: | |
# alls.remove(i) | |
with open("randomdata_concat.tsv","a")as f: | |
f.write("\n".join()) | |
#with open("dev_fix.tsv","a")as f: | |
# for i in v: | |
# f.write("\n".join(i)) | |
# f.write("\n\n") | |
#with open("test_fix.tsv","a")as f: | |
# for i in t: | |
# f.write("\n".join(i)) | |
# f.write("\n\n") | |
""" | |
""" | |
out = "" | |
for line in open("/clwork/ando/SEGBOT_BERT/alldata2_bert.tsv"): | |
line = line.split("\t") | |
line = line[0].strip() | |
if line == "" or "サマリ" in line: | |
continue | |
out += line + "\n" | |
with open("alldata3.tsv","w")as f: | |
f.write(out) | |
""" | |
""" | |
#wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic") | |
wakati = MeCab.Tagger("-Owakati -b 81920") | |
with open('fm_space.pickle', 'rb') as f: | |
fm = pickle.load(f) | |
#model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False) | |
#model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin") | |
texts = [] | |
sent = "" | |
sparate = [] | |
label = [] | |
ruiseki = 0 | |
ruiseki2 = 0 | |
alls = [] | |
for n, line in enumerate(open("/clwork/ando/SEGBOT/train_fix.tsv")): | |
line = line.strip("\t").rstrip("\n") | |
#print(line) | |
if line == "": | |
if sent == "": | |
continue | |
sent = wakati.parse(sent).split(" ")[:-1] | |
flag = 0 | |
for i in sent: | |
for j in sparate: | |
if ruiseki+len(i) > j and ruiseki < j: | |
label.append(1) | |
flag = 1 | |
elif ruiseki+len(i) == j: | |
label.append(1) | |
flag = 1 | |
if flag == 0: | |
label.append(0) | |
flag = 0 | |
ruiseki += len(i) | |
#texts += i + " " | |
try: | |
#texts.append(model[i]) | |
texts.append(fm.vocab[i]) | |
#texts += str(fm.vocab[i].index) + " " | |
#print(i,str(fm.vocab[i].index)) | |
except KeyError: | |
texts.append(fm.vocab["<unk>"]) | |
print(i) | |
label[-1] = 1 | |
#texts = texts.rstrip() + "\t" | |
#texts += " ".join(label) + "\n" | |
alls.append((str(n),texts,label)) | |
sent = "" | |
sparate = [] | |
texts = [] | |
label = [] | |
ruiseki = 0 | |
ruiseki2 = 0 | |
continue | |
sent += mojimoji.han_to_zen(line, digit=False, ascii=False) | |
ruiseki2 += len(line) | |
sparate.append(ruiseki2) | |
with open('nm_ni/train.pickle', 'wb') as f: | |
pickle.dump(alls, f) | |
#print(alls) | |
#with open("resepdata_seped.tsv","w")as f: | |
# f.write(texts) | |
""" | |
wakati = MeCab.Tagger("-Owakati") | |
#fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False) | |
#with open('fm.pickle', 'wb') as f: | |
# pickle.dump(fm, f) | |
texts = "" | |
sent = "" | |
sparate = [] | |
label = [] | |
ruiseki = 0 | |
ruiseki2 = 0 | |
for line in open("alldata.tsv"): | |
line = line.split("\t") | |
line = line[0].strip() | |
if line == "" or "サマリ" in line: | |
if sent == "": | |
continue | |
sent = wakati.parse(sent).split(" ")[:-1] | |
flag = 0 | |
#print(sent,sparate) | |
for i in sent: | |
#print(i) | |
for j in sparate: | |
if ruiseki+len(i) > j and ruiseki < j: | |
#print(j) | |
label.append("1") | |
flag = 1 | |
elif ruiseki+len(i) == j: | |
#print(j) | |
label.append("1") | |
flag = 1 | |
if flag == 0: | |
label.append("0") | |
flag = 0 | |
ruiseki += len(i) | |
#texts += i + " " | |
try: | |
texts += str(0) + " " | |
except KeyError: | |
print(i) | |
#texts += str(fm.vocab["<unk>"].index) + " " | |
label[-1] = "1" | |
texts = texts.rstrip() + "\t" | |
texts += " ".join(label) + "\n" | |
sent = "" | |
sparate = [] | |
label = [] | |
ruiseki = 0 | |
ruiseki2 = 0 | |
#print(texts) | |
continue | |
sent += line.strip() | |
ruiseki2 += len(line.strip()) | |
sparate.append(ruiseki2) | |
with open("random_labbeled.tsv","w")as f: | |
f.write(texts) | |
""" | |
wakati = MeCab.Tagger("-Owakati -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic") | |
#fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300_space.vec', binary=False) | |
#with open('fm_space.pickle', 'wb') as f: | |
# pickle.dump(fm, f) | |
with open('fm_space.pickle', 'rb') as f: | |
fm = pickle.load(f) | |
texts = "" | |
sent = "" | |
sparate = [] | |
label = [] | |
ruiseki = 0 | |
ruiseki2 = 0 | |
for line in open("/clwork/ando/SEGBOT/alldata_resep.tsv"): | |
line = line.split("\t") | |
line = line[0].strip("\t").rstrip("\n") | |
#print(line) | |
if line == "" or "サマリ" in line: | |
if sent == "": | |
continue | |
print(sent) | |
sent = sent.replace(" ","<space>") | |
sent = wakati.parse(sent).split(" ")[:-1] | |
print(sent) | |
flag = 0 | |
#print(sent,sparate) | |
for i in sent: | |
#print(i) | |
for j in sparate: | |
if ruiseki+len(i) > j and ruiseki < j: | |
#print(j) | |
label.append("1") | |
flag = 1 | |
elif ruiseki+len(i) == j: | |
#print(j) | |
label.append("1") | |
flag = 1 | |
if flag == 0: | |
label.append("0") | |
flag = 0 | |
ruiseki += len(i) | |
#texts += i + " " | |
try: | |
texts += str(fm.vocab[i].index) + " " | |
#print(i,str(fm.vocab[i].index)) | |
except KeyError: | |
texts += str(fm.vocab["<unk>"].index) + " " | |
label[-1] = "1" | |
texts = texts.rstrip() + "\t" | |
texts += " ".join(label) + "\n" | |
sent = "" | |
sparate = [] | |
label = [] | |
ruiseki = 0 | |
ruiseki2 = 0 | |
#print(texts) | |
continue | |
sent += line.strip("\t") | |
ruiseki2 += len(line) | |
sparate.append(ruiseki2) | |
with open("alldata2_space.tsv","w")as f: | |
f.write(texts) | |
""" | |
""" | |
wakati = MeCab.Tagger("-Owakati") | |
fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False) | |
texts = "" | |
sent = "" | |
cand = "" | |
sparate = [] | |
label = [] | |
ruiseki = 0 | |
ruiseki2 = 0 | |
flag2 = 1 | |
for line in open("data2.tsv"): | |
line = line.split("\t") | |
if flag2 == 1: | |
cand = line | |
flag2 = 2 | |
continue | |
if flag2 == 2: | |
flag2 = 1 | |
#print(line,cand) | |
for n,z in enumerate(zip(cand,line)): | |
i = z[0] | |
j = z[1] | |
n = n+1 | |
if i == "": | |
sent = wakati.parse(sent).split(" ")[:-1] | |
flag = 0 | |
#print(sent,sparate) | |
for i in sent: | |
#print(i) | |
for j in sparate: | |
if ruiseki+len(i) > j and ruiseki < j: | |
#print(j) | |
label.append("1") | |
flag = 1 | |
elif ruiseki+len(i) == j: | |
#print(j) | |
label.append("1") | |
flag = 1 | |
if flag == 0: | |
label.append("0") | |
flag = 0 | |
ruiseki += len(i) | |
#texts += i + " " | |
try: | |
texts += str(fm.vocab[i].index) + " " | |
except KeyError: | |
texts += str(fm.vocab["<unk>"].index) + " " | |
label[-1] = "1" | |
texts = texts.rstrip() + "\t" | |
texts += " ".join(label) + "\n" | |
sent = "" | |
sparate = [] | |
label = [] | |
ruiseki = 0 | |
ruiseki2 = 0 | |
#print(texts) | |
break | |
if j == "|": | |
sparate.append(n) | |
sent += i | |
with open("alldata.tsv","w")as f: | |
f.write(texts) | |
""" | |