Spaces:
Runtime error
Runtime error
Delete credata.py
Browse files- credata.py +0 -653
credata.py
DELETED
@@ -1,653 +0,0 @@
|
|
1 |
-
import gensim
|
2 |
-
import MeCab
|
3 |
-
import pickle
|
4 |
-
from gensim.models.wrappers.fasttext import FastText
|
5 |
-
#import fasttext as ft
|
6 |
-
import random
|
7 |
-
import mojimoji
|
8 |
-
import numpy as np
|
9 |
-
from tqdm import tqdm
|
10 |
-
|
11 |
-
def ymyi(lis):
    """Turn a blank-line-delimited text file into token vectors and boundary labels.

    Consecutive non-empty lines of ``lis`` form one record. Each line is
    normalised with ``mojimoji.han_to_zen`` (digits and ASCII left alone), the
    lines are concatenated and the result is tokenised with MeCab in wakati
    mode using the MANBYO user dictionary. A token is labelled 1 when the end
    of one of the original lines falls inside it or exactly at its end, and 0
    otherwise; the last token of every record is always labelled 1.

    Args:
        lis: Path of the input text file.

    Returns:
        A tuple ``(num, text, labels)`` of parallel lists, one entry per
        record: the 0-based index of the blank line that closed the record,
        the list of token vectors, and the list of 0/1 labels.

    Notes:
        * A final record that is not followed by a blank line is not emitted.
        * The original body called ``ft.load_model`` although the
          ``fasttext`` import is commented out, which raises ``NameError``.
          The gensim ``FastText`` wrapper that the file already imports is
          used instead; it raises ``KeyError`` for unknown words, which is
          what the fallback below relies on.
          NOTE(review): confirm this is the intended loader.
        * NOTE(review): ``pickle.load`` must only be used on trusted files.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")

    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)
    model = FastText.load_fasttext_format("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")

    num, text, labels = [], [], []
    pieces = []      # normalised lines of the record being collected
    boundaries = []  # cumulative end offset of each line inside the record
    with open(lis) as src:
        for n, line in enumerate(src):
            line = line.strip("\t").rstrip("\n")
            if line != "":
                converted = mojimoji.han_to_zen(line, digit=False, ascii=False)
                pieces.append(converted)
                # Measure the converted text: han_to_zen can change the
                # length, and the tokens are cut from the converted string.
                previous_end = boundaries[-1] if boundaries else 0
                boundaries.append(previous_end + len(converted))
                continue
            if not pieces:
                continue

            tokens = wakati.parse("".join(pieces)).split(" ")[:-1]
            pieces, record_boundaries, boundaries = [], boundaries, []
            if not tokens:
                # Nothing to label (e.g. a whitespace-only record).
                continue

            vectors, label = [], []
            offset = 0
            for token in tokens:
                end = offset + len(token)
                # Exactly one label per token, even if several line ends
                # fall inside the same token.
                crossed = any(offset < b < end or b == end
                              for b in record_boundaries)
                label.append(1 if crossed else 0)
                offset = end
                try:
                    vectors.append(model[token])
                except KeyError:
                    vectors.append(fm["<unk>"])
            label[-1] = 1  # a record always ends with a boundary

            labels.append(label)
            text.append(vectors)
            num.append(n)
    return num, text, labels
|
72 |
-
|
73 |
-
def nmni(lis):
    """Turn a blank-line-delimited text file into word vectors and boundary labels.

    Consecutive non-empty lines of ``lis`` form one record. Each line is
    normalised with ``mojimoji.han_to_zen`` (digits and ASCII left alone), the
    lines are concatenated and the result is tokenised with MeCab in wakati
    mode (default dictionary). A token is labelled 1 when the end of one of
    the original lines falls inside it or exactly at its end, and 0
    otherwise; the last token of every record is always labelled 1.

    Token vectors are looked up in the object unpickled from
    ``fm_space.pickle``; tokens that raise ``KeyError`` fall back to the
    ``"<unk>"`` entry.

    Args:
        lis: Path of the input text file.

    Returns:
        A tuple ``(num, text, labels)`` of parallel lists, one entry per
        record: the 0-based index of the blank line that closed the record,
        the list of token vectors, and the list of 0/1 labels.

    Notes:
        * A final record that is not followed by a blank line is not emitted.
        * NOTE(review): ``pickle.load`` must only be used on trusted files.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")

    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)

    num, text, labels = [], [], []
    pieces = []      # normalised lines of the record being collected
    boundaries = []  # cumulative end offset of each line inside the record
    with open(lis) as src:
        for n, line in enumerate(src):
            line = line.strip("\t").rstrip("\n")
            if line != "":
                converted = mojimoji.han_to_zen(line, digit=False, ascii=False)
                pieces.append(converted)
                # Measure the converted text: han_to_zen can change the
                # length, and the tokens are cut from the converted string.
                previous_end = boundaries[-1] if boundaries else 0
                boundaries.append(previous_end + len(converted))
                continue
            if not pieces:
                continue

            tokens = wakati.parse("".join(pieces)).split(" ")[:-1]
            pieces, record_boundaries, boundaries = [], boundaries, []
            if not tokens:
                # Nothing to label (e.g. a whitespace-only record).
                continue

            vectors, label = [], []
            offset = 0
            for token in tokens:
                end = offset + len(token)
                # Exactly one label per token, even if several line ends
                # fall inside the same token.
                crossed = any(offset < b < end or b == end
                              for b in record_boundaries)
                label.append(1 if crossed else 0)
                offset = end
                try:
                    vectors.append(fm[token])
                except KeyError:
                    vectors.append(fm["<unk>"])
            label[-1] = 1  # a record always ends with a boundary

            labels.append(label)
            text.append(vectors)
            num.append(n)
    return num, text, labels
|
135 |
-
|
136 |
-
def nmni_finetune(lis):
    """Turn a blank-line-delimited text file into vocabulary indices and boundary labels.

    Consecutive non-empty lines of ``lis`` form one record. Each line is
    normalised with ``mojimoji.han_to_zen`` (digits and ASCII left alone), the
    lines are concatenated and the result is tokenised with MeCab in wakati
    mode (default dictionary). A token is labelled 1 when the end of one of
    the original lines falls inside it or exactly at its end, and 0
    otherwise; the last token of every record is always labelled 1.

    Tokens are mapped to ``fm.vocab[token].index`` where ``fm`` is the object
    unpickled from ``fm.pickle``; tokens that raise ``KeyError`` map to the
    index of ``"<unk>"``.

    Args:
        lis: Path of the input text file.

    Returns:
        A tuple ``(text, labels)`` of parallel lists with one NumPy array per
        record: the token indices and the 0/1 labels.

    Notes:
        * A final record that is not followed by a blank line is not emitted.
        * NOTE(review): ``pickle.load`` must only be used on trusted files.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)

    text, labels = [], []
    pieces = []      # normalised lines of the record being collected
    boundaries = []  # cumulative end offset of each line inside the record
    with open(lis) as src:
        for line in src:
            line = line.strip("\t").rstrip("\n")
            if line != "":
                converted = mojimoji.han_to_zen(line, digit=False, ascii=False)
                pieces.append(converted)
                # Measure the converted text: han_to_zen can change the
                # length, and the tokens are cut from the converted string.
                previous_end = boundaries[-1] if boundaries else 0
                boundaries.append(previous_end + len(converted))
                continue
            if not pieces:
                continue

            tokens = wakati.parse("".join(pieces)).split(" ")[:-1]
            pieces, record_boundaries, boundaries = [], boundaries, []
            if not tokens:
                # Nothing to label (e.g. a whitespace-only record).
                continue

            indices, label = [], []
            offset = 0
            for token in tokens:
                end = offset + len(token)
                # Exactly one label per token, even if several line ends
                # fall inside the same token.
                crossed = any(offset < b < end or b == end
                              for b in record_boundaries)
                label.append(1 if crossed else 0)
                offset = end
                try:
                    indices.append(fm.vocab[token].index)
                except KeyError:
                    indices.append(fm.vocab["<unk>"].index)
            label[-1] = 1  # a record always ends with a boundary

            labels.append(np.array(label))
            text.append(np.array(indices))
    return text, labels
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
def nmni_carte(lis, n_files=108):
    """Encode the numbered text files ``<lis>0.txt`` ... ``<lis><n_files-1>.txt``.

    Every non-blank line (after ``str.strip``) is tokenised with MeCab in
    wakati mode. Tokens are mapped to ``fm.vocab[token].index`` where ``fm``
    is the object unpickled from ``fm.pickle``; tokens that raise
    ``KeyError`` map to the index of ``"<unk>"``. Labels are all 0 except the
    last token of each line, which is 1.

    Args:
        lis: Path prefix; the file index and ``".txt"`` are appended to it.
        n_files: Number of files to read, starting at index 0. Defaults to
            108, the value previously hard-coded.

    Returns:
        A tuple ``(altex, allab, fukugenss)`` of parallel lists with one
        entry per file. Each entry is a list with one item per processed
        line: a NumPy array of token indices, a NumPy array of labels, and
        the list of surface tokens respectively.

    Notes:
        NOTE(review): ``pickle.load`` must only be used on trusted files.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)

    altex, allab, fukugenss = [], [], []
    for n in tqdm(range(n_files)):
        text, labels, fukugens = [], [], []
        with open(lis + str(n) + ".txt") as src:
            for line in src:
                line = line.strip()
                if line == "":
                    continue
                tokens = wakati.parse(line).split(" ")[:-1]
                if not tokens:
                    # Nothing to encode; avoids IndexError on label[-1].
                    continue
                indices = []
                for token in tokens:
                    try:
                        indices.append(fm.vocab[token].index)
                    except KeyError:
                        indices.append(fm.vocab["<unk>"].index)
                label = [0] * len(tokens)
                label[-1] = 1  # each line ends with a boundary
                labels.append(np.array(label))
                text.append(np.array(indices))
                fukugens.append(tokens)
        allab.append(labels)
        altex.append(text)
        fukugenss.append(fukugens)
    return altex, allab, fukugenss
|
254 |
-
|
255 |
-
|
256 |
-
def nmni_finetune_s(lis):
    """Encode each line of ``lis`` as vocabulary indices with an end-of-line label.

    The word vectors are loaded from ``cc.ja.300.vec`` (word2vec text format)
    and, as a side effect, re-written to ``fm.pickle`` on every call. Each
    line is tokenised with MeCab in wakati mode. Tokens are mapped to
    ``fm.vocab[token].index``; tokens that raise ``KeyError`` map to the
    index of ``"<unk>"``. Labels are all 0 except the last token of each
    line, which is 1.

    Lines that yield no tokens (for example blank lines) are skipped. The
    original code raised ``IndexError`` on them, whereas the sibling
    functions in this file skip blank lines.

    Args:
        lis: Path of the input text file, one sentence per line.

    Returns:
        A tuple ``(text, labels)`` of parallel lists with one NumPy array per
        processed line: the token indices and the 0/1 labels.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)

    text, labels = [], []
    with open(lis) as src:
        for line in src:
            line = line.strip("\t").rstrip("\n")
            tokens = wakati.parse(line).split(" ")[:-1]
            if not tokens:
                continue
            indices = []
            for token in tokens:
                try:
                    indices.append(fm.vocab[token].index)
                except KeyError:
                    indices.append(fm.vocab["<unk>"].index)
            label = [0] * len(tokens)
            label[-1] = 1  # each line ends with a boundary
            labels.append(np.array(label))
            text.append(np.array(indices))
    return text, labels
|
289 |
-
|
290 |
-
|
291 |
-
def nmni_finetune_ss(lis, n_files=108):
    """Encode the numbered text files ``<lis>0.txt`` ... ``<lis><n_files-1>.txt``.

    The word vectors are loaded from ``cc.ja.300.vec`` (word2vec text format)
    and, as a side effect, re-written to ``fm.pickle`` on every call. Every
    non-blank line is tokenised with MeCab in wakati mode. Tokens are mapped
    to ``fm.vocab[token].index``; tokens that raise ``KeyError`` map to the
    index of ``"<unk>"``. Labels are all 0 except the last token of each
    line, which is 1.

    Args:
        lis: Path prefix; the file index and ``".txt"`` are appended to it.
        n_files: Number of files to read, starting at index 0. Defaults to
            108, the value previously hard-coded.

    Returns:
        A tuple ``(t, l)`` of parallel lists with one entry per file; each
        entry is a list of NumPy arrays (token indices in ``t``, labels in
        ``l``), one per processed line.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)

    t, l = [], []
    # The file index has its own name: the original reused ``i`` for both
    # the file index and the token loop variable.
    for file_index in range(n_files):
        text, labels = [], []
        with open(lis + str(file_index) + ".txt") as src:
            for line in src:
                line = line.strip("\t").rstrip("\n")
                if line == "":
                    continue
                tokens = wakati.parse(line).split(" ")[:-1]
                if not tokens:
                    # Nothing to encode; avoids IndexError on label[-1].
                    continue
                indices = []
                for token in tokens:
                    try:
                        indices.append(fm.vocab[token].index)
                    except KeyError:
                        indices.append(fm.vocab["<unk>"].index)
                label = [0] * len(tokens)
                label[-1] = 1  # each line ends with a boundary
                labels.append(np.array(label))
                text.append(np.array(indices))
        t.append(text)
        l.append(labels)
    return t, l
|
331 |
-
|
332 |
-
#model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
|
333 |
-
#print(model.get_subwords("間質性肺炎"))
|
334 |
-
#print(model.get_subwords("誤嚥性肺炎"))
|
335 |
-
#print(model.get_subwords("談話ユニット分割"))
|
336 |
-
|
337 |
-
"""
|
338 |
-
texts = []
|
339 |
-
sent = ""
|
340 |
-
sparate = []
|
341 |
-
label = []
|
342 |
-
ruiseki = 0
|
343 |
-
ruiseki2 = 0
|
344 |
-
alls = []
|
345 |
-
for n, line in enumerate(open("/clwork/ando/SEGBOT/randomdata.tsv")):
|
346 |
-
line = line.strip("\t").rstrip("\n")
|
347 |
-
if line == "":
|
348 |
-
if sent == "":
|
349 |
-
continue
|
350 |
-
alls.append(sent)
|
351 |
-
sent = ""
|
352 |
-
continue
|
353 |
-
else:
|
354 |
-
sent += line
|
355 |
-
if len(sent) != 0:
|
356 |
-
alls.append(sent)
|
357 |
-
random.shuffle(alls)
|
358 |
-
#v = random.sample(alls, 300)
|
359 |
-
#for i in v:
|
360 |
-
# alls.remove(i)
|
361 |
-
#t = random.sample(alls, 300)
|
362 |
-
#for i in t:
|
363 |
-
# alls.remove(i)
|
364 |
-
with open("randomdata_concat.tsv","a")as f:
|
365 |
-
f.write("\n".join())
|
366 |
-
#with open("dev_fix.tsv","a")as f:
|
367 |
-
# for i in v:
|
368 |
-
# f.write("\n".join(i))
|
369 |
-
# f.write("\n\n")
|
370 |
-
#with open("test_fix.tsv","a")as f:
|
371 |
-
# for i in t:
|
372 |
-
# f.write("\n".join(i))
|
373 |
-
# f.write("\n\n")
|
374 |
-
"""
|
375 |
-
|
376 |
-
"""
|
377 |
-
out = ""
|
378 |
-
for line in open("/clwork/ando/SEGBOT_BERT/alldata2_bert.tsv"):
|
379 |
-
line = line.split("\t")
|
380 |
-
line = line[0].strip()
|
381 |
-
if line == "" or "サマリ" in line:
|
382 |
-
continue
|
383 |
-
out += line + "\n"
|
384 |
-
with open("alldata3.tsv","w")as f:
|
385 |
-
f.write(out)
|
386 |
-
"""
|
387 |
-
"""
|
388 |
-
#wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
|
389 |
-
wakati = MeCab.Tagger("-Owakati -b 81920")
|
390 |
-
|
391 |
-
with open('fm_space.pickle', 'rb') as f:
|
392 |
-
fm = pickle.load(f)
|
393 |
-
#model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
|
394 |
-
#model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
|
395 |
-
texts = []
|
396 |
-
sent = ""
|
397 |
-
sparate = []
|
398 |
-
label = []
|
399 |
-
ruiseki = 0
|
400 |
-
ruiseki2 = 0
|
401 |
-
alls = []
|
402 |
-
for n, line in enumerate(open("/clwork/ando/SEGBOT/train_fix.tsv")):
|
403 |
-
line = line.strip("\t").rstrip("\n")
|
404 |
-
#print(line)
|
405 |
-
if line == "":
|
406 |
-
if sent == "":
|
407 |
-
continue
|
408 |
-
sent = wakati.parse(sent).split(" ")[:-1]
|
409 |
-
flag = 0
|
410 |
-
for i in sent:
|
411 |
-
for j in sparate:
|
412 |
-
if ruiseki+len(i) > j and ruiseki < j:
|
413 |
-
label.append(1)
|
414 |
-
flag = 1
|
415 |
-
elif ruiseki+len(i) == j:
|
416 |
-
label.append(1)
|
417 |
-
flag = 1
|
418 |
-
if flag == 0:
|
419 |
-
label.append(0)
|
420 |
-
flag = 0
|
421 |
-
ruiseki += len(i)
|
422 |
-
#texts += i + " "
|
423 |
-
try:
|
424 |
-
#texts.append(model[i])
|
425 |
-
texts.append(fm.vocab[i])
|
426 |
-
#texts += str(fm.vocab[i].index) + " "
|
427 |
-
#print(i,str(fm.vocab[i].index))
|
428 |
-
except KeyError:
|
429 |
-
texts.append(fm.vocab["<unk>"])
|
430 |
-
print(i)
|
431 |
-
label[-1] = 1
|
432 |
-
#texts = texts.rstrip() + "\t"
|
433 |
-
#texts += " ".join(label) + "\n"
|
434 |
-
alls.append((str(n),texts,label))
|
435 |
-
sent = ""
|
436 |
-
sparate = []
|
437 |
-
texts = []
|
438 |
-
label = []
|
439 |
-
ruiseki = 0
|
440 |
-
ruiseki2 = 0
|
441 |
-
continue
|
442 |
-
sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
|
443 |
-
ruiseki2 += len(line)
|
444 |
-
sparate.append(ruiseki2)
|
445 |
-
with open('nm_ni/train.pickle', 'wb') as f:
|
446 |
-
pickle.dump(alls, f)
|
447 |
-
#print(alls)
|
448 |
-
#with open("resepdata_seped.tsv","w")as f:
|
449 |
-
# f.write(texts)
|
450 |
-
"""
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
# Module-level script (runs on import): reads "alldata.tsv", groups lines into
# records, and writes one row per record to "random_labbeled.tsv".
#
# Only the first tab-separated field of each input line is used. A line whose
# field is empty or contains "サマリ" ends the current record. Each output
# row is "<token ids>\t<labels>\n": the vocabulary lookup is disabled here, so
# every token id is written as the placeholder "0". A label is "1" when the
# end of an input line falls inside the token or exactly at its end; the last
# token of a record is always "1".
#
# NOTE(review): a final record that is not followed by a terminator line is
# not written.
wakati = MeCab.Tagger("-Owakati")

texts = ""
sent = ""
sparate = []   # cumulative end offset of each line inside the current record
label = []
ruiseki = 0    # running character offset over the tokens of the record
ruiseki2 = 0   # running character offset over the raw lines of the record
_rows = []     # finished output rows, joined once at the end
with open("alldata.tsv") as f:
    for line in f:
        line = line.split("\t")[0].strip()
        if line == "" or "サマリ" in line:
            if sent == "":
                continue
            sent = wakati.parse(sent).split(" ")[:-1]
            for i in sent:
                _end = ruiseki + len(i)
                # Exactly one label per token, even if several line ends
                # fall inside the same token.
                _hit = any(ruiseki < j < _end or j == _end for j in sparate)
                label.append("1" if _hit else "0")
                ruiseki = _end
            label[-1] = "1"  # a record always ends with a boundary
            _rows.append(" ".join("0" for _ in sent) + "\t" + " ".join(label) + "\n")
            sent = ""
            sparate = []
            label = []
            ruiseki = 0
            ruiseki2 = 0
            continue
        sent += line
        ruiseki2 += len(line)
        sparate.append(ruiseki2)
texts = "".join(_rows)
with open("random_labbeled.tsv", "w") as f:
    f.write(texts)
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
"""
|
518 |
-
wakati = MeCab.Tagger("-Owakati -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
|
519 |
-
|
520 |
-
|
521 |
-
#fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300_space.vec', binary=False)
|
522 |
-
#with open('fm_space.pickle', 'wb') as f:
|
523 |
-
# pickle.dump(fm, f)
|
524 |
-
|
525 |
-
with open('fm_space.pickle', 'rb') as f:
|
526 |
-
fm = pickle.load(f)
|
527 |
-
texts = ""
|
528 |
-
sent = ""
|
529 |
-
sparate = []
|
530 |
-
label = []
|
531 |
-
ruiseki = 0
|
532 |
-
ruiseki2 = 0
|
533 |
-
for line in open("/clwork/ando/SEGBOT/alldata_resep.tsv"):
|
534 |
-
line = line.split("\t")
|
535 |
-
line = line[0].strip("\t").rstrip("\n")
|
536 |
-
#print(line)
|
537 |
-
if line == "" or "サマリ" in line:
|
538 |
-
if sent == "":
|
539 |
-
continue
|
540 |
-
print(sent)
|
541 |
-
sent = sent.replace(" ","<space>")
|
542 |
-
sent = wakati.parse(sent).split(" ")[:-1]
|
543 |
-
print(sent)
|
544 |
-
flag = 0
|
545 |
-
#print(sent,sparate)
|
546 |
-
for i in sent:
|
547 |
-
#print(i)
|
548 |
-
for j in sparate:
|
549 |
-
if ruiseki+len(i) > j and ruiseki < j:
|
550 |
-
#print(j)
|
551 |
-
label.append("1")
|
552 |
-
flag = 1
|
553 |
-
elif ruiseki+len(i) == j:
|
554 |
-
#print(j)
|
555 |
-
label.append("1")
|
556 |
-
flag = 1
|
557 |
-
if flag == 0:
|
558 |
-
label.append("0")
|
559 |
-
flag = 0
|
560 |
-
ruiseki += len(i)
|
561 |
-
#texts += i + " "
|
562 |
-
|
563 |
-
try:
|
564 |
-
texts += str(fm.vocab[i].index) + " "
|
565 |
-
#print(i,str(fm.vocab[i].index))
|
566 |
-
except KeyError:
|
567 |
-
texts += str(fm.vocab["<unk>"].index) + " "
|
568 |
-
label[-1] = "1"
|
569 |
-
texts = texts.rstrip() + "\t"
|
570 |
-
texts += " ".join(label) + "\n"
|
571 |
-
sent = ""
|
572 |
-
sparate = []
|
573 |
-
label = []
|
574 |
-
ruiseki = 0
|
575 |
-
ruiseki2 = 0
|
576 |
-
#print(texts)
|
577 |
-
continue
|
578 |
-
sent += line.strip("\t")
|
579 |
-
ruiseki2 += len(line)
|
580 |
-
sparate.append(ruiseki2)
|
581 |
-
with open("alldata2_space.tsv","w")as f:
|
582 |
-
f.write(texts)
|
583 |
-
"""
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
"""
|
588 |
-
wakati = MeCab.Tagger("-Owakati")
|
589 |
-
|
590 |
-
fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
|
591 |
-
texts = ""
|
592 |
-
sent = ""
|
593 |
-
cand = ""
|
594 |
-
sparate = []
|
595 |
-
label = []
|
596 |
-
ruiseki = 0
|
597 |
-
ruiseki2 = 0
|
598 |
-
flag2 = 1
|
599 |
-
for line in open("data2.tsv"):
|
600 |
-
line = line.split("\t")
|
601 |
-
if flag2 == 1:
|
602 |
-
cand = line
|
603 |
-
flag2 = 2
|
604 |
-
continue
|
605 |
-
if flag2 == 2:
|
606 |
-
flag2 = 1
|
607 |
-
#print(line,cand)
|
608 |
-
for n,z in enumerate(zip(cand,line)):
|
609 |
-
i = z[0]
|
610 |
-
j = z[1]
|
611 |
-
n = n+1
|
612 |
-
if i == "":
|
613 |
-
sent = wakati.parse(sent).split(" ")[:-1]
|
614 |
-
flag = 0
|
615 |
-
#print(sent,sparate)
|
616 |
-
for i in sent:
|
617 |
-
#print(i)
|
618 |
-
for j in sparate:
|
619 |
-
if ruiseki+len(i) > j and ruiseki < j:
|
620 |
-
#print(j)
|
621 |
-
label.append("1")
|
622 |
-
flag = 1
|
623 |
-
elif ruiseki+len(i) == j:
|
624 |
-
#print(j)
|
625 |
-
label.append("1")
|
626 |
-
flag = 1
|
627 |
-
if flag == 0:
|
628 |
-
label.append("0")
|
629 |
-
flag = 0
|
630 |
-
ruiseki += len(i)
|
631 |
-
#texts += i + " "
|
632 |
-
|
633 |
-
try:
|
634 |
-
texts += str(fm.vocab[i].index) + " "
|
635 |
-
except KeyError:
|
636 |
-
texts += str(fm.vocab["<unk>"].index) + " "
|
637 |
-
|
638 |
-
label[-1] = "1"
|
639 |
-
texts = texts.rstrip() + "\t"
|
640 |
-
texts += " ".join(label) + "\n"
|
641 |
-
sent = ""
|
642 |
-
sparate = []
|
643 |
-
label = []
|
644 |
-
ruiseki = 0
|
645 |
-
ruiseki2 = 0
|
646 |
-
#print(texts)
|
647 |
-
break
|
648 |
-
if j == "|":
|
649 |
-
sparate.append(n)
|
650 |
-
sent += i
|
651 |
-
with open("alldata.tsv","w")as f:
|
652 |
-
f.write(texts)
|
653 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|