"""Preprocessing loaders for SEGBOT-style discourse segmentation of Japanese
(medical) text.

Each loader tokenises raw text with MeCab and converts tokens to fastText /
word2vec representations, producing one 0/1 label per token where 1 marks a
token that ends an original line (i.e. a segment boundary).

NOTE(review): this module also runs a labelling script at import time (bottom
of the file).  Several large blocks of dead, triple-quoted scratch code that
previously surrounded it were removed; only the active script is kept.
"""
import gensim
import MeCab
import pickle
from gensim.models.wrappers.fasttext import FastText
#import fasttext as ft
import random
import mojimoji
import numpy as np
from tqdm import tqdm


def _boundary_labels(tokens, boundaries):
    """Return one 0/1 boundary label per token.

    A token gets label 1 when some original line end (a cumulative character
    offset in *boundaries*) falls strictly inside the token, or exactly at
    its end.

    FIX(review): the original inline loops appended one ``1`` per *matching
    boundary*, so a token spanning two line ends produced two labels and
    desynchronised the label list from the token list; this helper emits
    exactly one label per token.
    """
    labels = []
    start = 0
    for tok in tokens:
        end = start + len(tok)
        hit = any(start < b < end or b == end for b in boundaries)
        labels.append(1 if hit else 0)
        start = end
    return labels


def ymyi(lis):
    """Load a blank-line-separated corpus and embed tokens with fastText.

    *lis* names a file in which consecutive non-blank lines form one document
    (each non-blank line being one gold segment) and a blank line terminates
    the document.

    Returns ``(num, text, labels)``: the file line index of each document's
    terminating blank line, a list (one entry per document) of per-token
    fastText vectors, and the matching 0/1 boundary labels (the last token of
    a document is always labelled 1).
    """
    wakati = MeCab.Tagger(
        "-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)
    # FIX(review): the original called ft.load_model(), but `import fasttext
    # as ft` is commented out at the top of the file, so this raised
    # NameError.  Load the same .bin via the gensim wrapper that IS imported.
    model = FastText.load_fasttext_format("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
    num, text, labels = [], [], []
    sent = ""
    sparate = []   # cumulative character offsets of segment (line) ends
    ruiseki2 = 0
    for n, line in enumerate(open(lis)):
        line = line.strip("\t").rstrip("\n")
        if line != "":
            # NOTE(review): boundary offsets use the RAW line length while the
            # accumulated text is half->full-width normalised; this assumes
            # han_to_zen is length-preserving — confirm for dakuten kana.
            sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
            ruiseki2 += len(line)
            sparate.append(ruiseki2)
            continue
        if sent == "":
            continue  # consecutive blank lines
        tokens = wakati.parse(sent).split(" ")[:-1]
        label = _boundary_labels(tokens, sparate)
        texts = []
        for tok in tokens:
            try:
                texts.append(model[tok])
            except KeyError:
                # OOV even for fastText subwords: fall back to the pickled
                # keyed vectors' empty-string entry (assumed to exist —
                # TODO(review) confirm).
                texts.append(fm[""])
        label[-1] = 1  # a document always ends a segment
        labels.append(label)
        text.append(texts)
        num.append(n)
        sent = ""
        sparate = []
        ruiseki2 = 0
    return num, text, labels


def nmni(lis):
    """Same corpus format and return shape as :func:`ymyi`, but tokens are
    looked up directly in the pickled keyed vectors (``fm_space.pickle``)
    instead of a fastText model, and the plain (non-MANBYO) MeCab dictionary
    is used.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)
    num, text, labels = [], [], []
    sent = ""
    sparate = []
    ruiseki2 = 0
    for n, line in enumerate(open(lis)):
        line = line.strip("\t").rstrip("\n")
        if line != "":
            sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
            ruiseki2 += len(line)
            sparate.append(ruiseki2)
            continue
        if sent == "":
            continue
        tokens = wakati.parse(sent).split(" ")[:-1]
        label = _boundary_labels(tokens, sparate)
        texts = []
        for tok in tokens:
            try:
                texts.append(fm[tok])
            except KeyError:
                # NOTE(review): assumes "" is a valid key of fm — otherwise
                # this raises KeyError itself.  TODO confirm.
                texts.append(fm[""])
        label[-1] = 1
        labels.append(label)
        text.append(texts)
        num.append(n)
        sent = ""
        sparate = []
        ruiseki2 = 0
    return num, text, labels


def nmni_finetune(lis):
    """Like :func:`nmni` but emits vocabulary *indices* (for an embedding
    layer that is fine-tuned downstream) instead of vectors.

    Returns ``(text, labels)`` where both are lists of ``np.ndarray``, one
    array per document.  Uses the gensim<4 ``fm.vocab[word].index`` API —
    TODO(review): confirm the pinned gensim version.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)
    text, labels = [], []
    sent = ""
    sparate = []
    ruiseki2 = 0
    for line in open(lis):
        line = line.strip("\t").rstrip("\n")
        if line != "":
            sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
            ruiseki2 += len(line)
            sparate.append(ruiseki2)
            continue
        if sent == "":
            continue
        tokens = wakati.parse(sent).split(" ")[:-1]
        label = _boundary_labels(tokens, sparate)
        texts = []
        for tok in tokens:
            try:
                texts.append(fm.vocab[tok].index)
            except KeyError:
                texts.append(fm.vocab[""].index)
        label[-1] = 1
        labels.append(np.array(label))
        text.append(np.array(texts))
        sent = ""
        sparate = []
        ruiseki2 = 0
    return text, labels


def nmni_carte(lis):
    """Load 108 numbered carte files (``<lis><n>.txt``) at line granularity.

    Every non-blank line is one "sentence"; each token is mapped to its
    vocabulary index, every token is labelled 0 except the final token of the
    line (labelled 1).

    Returns ``(altex, allab, fukugenss)``: per-file lists of index arrays,
    label arrays, and the surface-form token lists (for restoring the text).
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)
    allab, altex, fukugenss = [], [], []
    labels, text = [], []
    # NOTE(review): 108 (was 26431 in an earlier run) is the hard-coded
    # number of carte files; parameterising it would change the interface.
    for n in tqdm(range(108)):
        fukugens = []
        for line in open(lis + str(n) + ".txt"):
            line = line.strip()
            if line == "":
                continue
            tokens = wakati.parse(line).split(" ")[:-1]
            label = []
            texts = []
            fukugen = []
            for tok in tokens:
                try:
                    texts.append(fm.vocab[tok].index)
                except KeyError:
                    texts.append(fm.vocab[""].index)
                fukugen.append(tok)
                label.append(0)
            label[-1] = 1  # line end is always a boundary
            labels.append(np.array(label))
            text.append(np.array(texts))
            fukugens.append(fukugen)
        allab.append(labels)
        altex.append(text)
        fukugenss.append(fukugens)
        labels, text = [], []
    return altex, allab, fukugenss


def nmni_finetune_s(lis):
    """Line-granularity variant of :func:`nmni_finetune` over a single file.

    Loads ``cc.ja.300.vec`` fresh and caches it to ``fm.pickle`` as a side
    effect (expensive!).  Returns ``(text, labels)`` as lists of arrays, one
    per input line.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)
    text, labels = [], []
    for line in open(lis):
        line = line.strip("\t").rstrip("\n")
        tokens = wakati.parse(line).split(" ")[:-1]
        # FIX(review): the original had no guard here, so a blank input line
        # crashed on `label[-1]` with IndexError; skip empty token lists.
        if not tokens:
            continue
        label = []
        texts = []
        for tok in tokens:
            try:
                texts.append(fm.vocab[tok].index)
            except KeyError:
                texts.append(fm.vocab[""].index)
            label.append(0)
        label[-1] = 1
        labels.append(np.array(label))
        text.append(np.array(texts))
    return text, labels


def nmni_finetune_ss(lis):
    """Like :func:`nmni_finetune_s`, but over 108 numbered files
    (``<lis><i>.txt``), returning per-file lists ``(t, l)``.

    Also re-loads ``cc.ja.300.vec`` and rewrites ``fm.pickle`` on every call.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)
    t, l = [], []
    # FIX(review): the original reused `i` for both the file index and the
    # token loop; renamed to avoid the shadowing.
    for file_idx in range(108):
        text, labels = [], []
        for line in open(lis + str(file_idx) + ".txt"):
            line = line.strip("\t").rstrip("\n")
            if line == "":
                continue
            tokens = wakati.parse(line).split(" ")[:-1]
            label = []
            texts = []
            for tok in tokens:
                try:
                    texts.append(fm.vocab[tok].index)
                except KeyError:
                    texts.append(fm.vocab[""].index)
                label.append(0)
            label[-1] = 1
            labels.append(np.array(label))
            text.append(np.array(texts))
        t.append(text)
        l.append(labels)
    return t, l


# ---------------------------------------------------------------------------
# Import-time script (kept at module level for backward compatibility):
# reads "alldata.tsv", skips summary lines, and writes "random_labbeled.tsv"
# with a dummy token id ("0") per token plus the boundary labels.
# NOTE(review): consider moving this under `if __name__ == "__main__":` —
# left as-is because importing this module currently triggers it.
# ---------------------------------------------------------------------------
wakati = MeCab.Tagger("-Owakati")
texts = ""        # accumulated output TSV: "<ids>\t<labels>\n" per sentence
sent = ""
sparate = []
ruiseki2 = 0
for line in open("alldata.tsv"):
    line = line.split("\t")[0].strip()
    if line == "" or "サマリ" in line:
        if sent == "":
            continue
        tokens = wakati.parse(sent).split(" ")[:-1]
        label = [str(v) for v in _boundary_labels(tokens, sparate)]
        label[-1] = "1"
        # Dummy id "0" per token; the real embedding lookup lived in (now
        # removed) scratch code.  The dead try/except around str(0) was
        # dropped — it could never raise KeyError.
        texts += " ".join("0" for _ in tokens)
        texts = texts.rstrip() + "\t"
        texts += " ".join(label) + "\n"
        sent = ""
        sparate = []
        ruiseki2 = 0
        continue
    sent += line
    ruiseki2 += len(line)
    sparate.append(ruiseki2)
with open("random_labbeled.tsv", "w") as f:
    f.write(texts)