ando55 committed on
Commit
082fb7f
·
1 Parent(s): aa6ee7a

Delete credata.py

Browse files
Files changed (1) hide show
  1. credata.py +0 -653
credata.py DELETED
@@ -1,653 +0,0 @@
1
- import gensim
2
- import MeCab
3
- import pickle
4
- from gensim.models.wrappers.fasttext import FastText
5
- #import fasttext as ft
6
- import random
7
- import mojimoji
8
- import numpy as np
9
- from tqdm import tqdm
10
-
11
def ymyi(lis):
    """Turn a blank-line-delimited text file into fastText-vector samples.

    Consecutive non-blank lines of ``lis`` form one sample.  The lines are
    concatenated (after half-width -> full-width conversion), tokenised with
    MeCab using the MANBYO user dictionary, and every token is labelled 1 when
    an original line boundary falls inside it or at its end, else 0.  The last
    token of a sample is always labelled 1.

    Args:
        lis: Path of the input text file.

    Returns:
        A tuple ``(num, text, labels)`` of parallel lists, one entry per
        sample: the 0-based index of the blank line that closed the sample,
        the list of per-token vectors, and the list of 0/1 labels.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")

    # NOTE: pickle.load executes arbitrary code; only use a trusted file.
    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)
    # The original called ``ft.load_model`` but ``ft`` is never imported in
    # this file (NameError).  Load the same .bin through the gensim wrapper
    # that the file already imports.
    # NOTE(review): confirm the gensim wrapper is the intended loader.
    model = FastText.load_fasttext_format("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")

    with open(lis) as src:
        lines = list(src)
    # Sentinel blank line so a final sample without a trailing blank line
    # is flushed instead of being silently dropped.
    lines.append("")

    num, text, labels = [], [], []
    sent = ""
    sparate = []   # cumulative character offsets of the line ends in `sent`
    ruiseki2 = 0   # running length of `sent`
    for n, line in enumerate(lines):
        line = line.strip("\t").rstrip("\n")
        if line != "":
            zen = mojimoji.han_to_zen(line, digit=False, ascii=False)
            sent += zen
            # Measure the converted text: the conversion can change the
            # length, and the offsets are compared against `sent`.
            ruiseki2 += len(zen)
            sparate.append(ruiseki2)
            continue
        if sent == "":
            continue

        tokens = wakati.parse(sent).split(" ")[:-1]
        boundaries = sparate
        sent, sparate, ruiseki2 = "", [], 0
        if not tokens:
            # Nothing to label (would otherwise fail on label[-1]).
            continue

        texts, label = [], []
        ruiseki = 0  # start offset of the current token
        for tok in tokens:
            end = ruiseki + len(tok)
            # Exactly one label per token, even if several boundaries hit it.
            hit = any(ruiseki < b <= end for b in boundaries)
            label.append(1 if hit else 0)
            ruiseki = end
            try:
                texts.append(model[tok])
            except KeyError:
                texts.append(fm["<unk>"])
        label[-1] = 1
        labels.append(label)
        text.append(texts)
        num.append(n)
    return num, text, labels
72
-
73
def nmni(lis):
    """Turn a blank-line-delimited text file into word-vector samples.

    Consecutive non-blank lines of ``lis`` form one sample.  The lines are
    concatenated (after half-width -> full-width conversion), tokenised with
    MeCab, and every token is labelled 1 when an original line boundary falls
    inside it or at its end, else 0.  The last token of a sample is always
    labelled 1.  Tokens are looked up in the object unpickled from
    ``fm_space.pickle``; a ``KeyError`` falls back to its ``"<unk>"`` entry.

    Args:
        lis: Path of the input text file.

    Returns:
        A tuple ``(num, text, labels)`` of parallel lists, one entry per
        sample: the 0-based index of the blank line that closed the sample,
        the list of per-token lookups, and the list of 0/1 labels.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")

    # NOTE: pickle.load executes arbitrary code; only use a trusted file.
    with open('fm_space.pickle', 'rb') as f:
        fm = pickle.load(f)

    with open(lis) as src:
        lines = list(src)
    # Sentinel blank line so a final sample without a trailing blank line
    # is flushed instead of being silently dropped.
    lines.append("")

    num, text, labels = [], [], []
    sent = ""
    sparate = []   # cumulative character offsets of the line ends in `sent`
    ruiseki2 = 0   # running length of `sent`
    for n, line in enumerate(lines):
        line = line.strip("\t").rstrip("\n")
        if line != "":
            zen = mojimoji.han_to_zen(line, digit=False, ascii=False)
            sent += zen
            # Measure the converted text: the conversion can change the
            # length, and the offsets are compared against `sent`.
            ruiseki2 += len(zen)
            sparate.append(ruiseki2)
            continue
        if sent == "":
            continue

        tokens = wakati.parse(sent).split(" ")[:-1]
        boundaries = sparate
        sent, sparate, ruiseki2 = "", [], 0
        if not tokens:
            # Nothing to label (would otherwise fail on label[-1]).
            continue

        texts, label = [], []
        ruiseki = 0  # start offset of the current token
        for tok in tokens:
            end = ruiseki + len(tok)
            # Exactly one label per token, even if several boundaries hit it.
            hit = any(ruiseki < b <= end for b in boundaries)
            label.append(1 if hit else 0)
            ruiseki = end
            try:
                texts.append(fm[tok])
            except KeyError:
                texts.append(fm["<unk>"])
        label[-1] = 1
        labels.append(label)
        text.append(texts)
        num.append(n)
    return num, text, labels
135
-
136
def nmni_finetune(lis):
    """Turn a blank-line-delimited text file into vocabulary-index samples.

    Consecutive non-blank lines of ``lis`` form one sample.  The lines are
    concatenated (after half-width -> full-width conversion), tokenised with
    MeCab, and every token is labelled 1 when an original line boundary falls
    inside it or at its end, else 0.  The last token of a sample is always
    labelled 1.  Each token becomes ``fm.vocab[token].index`` where ``fm`` is
    unpickled from ``fm.pickle``; a ``KeyError`` falls back to the index of
    ``"<unk>"``.

    Args:
        lis: Path of the input text file.

    Returns:
        A tuple ``(text, labels)`` of parallel lists holding one NumPy array
        of token indices and one NumPy array of 0/1 labels per sample.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")

    # NOTE: pickle.load executes arbitrary code; only use a trusted file.
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)

    with open(lis) as src:
        lines = list(src)
    # Sentinel blank line so a final sample without a trailing blank line
    # is flushed instead of being silently dropped.
    lines.append("")

    text, labels = [], []
    sent = ""
    sparate = []   # cumulative character offsets of the line ends in `sent`
    ruiseki2 = 0   # running length of `sent`
    for line in lines:
        line = line.strip("\t").rstrip("\n")
        if line != "":
            zen = mojimoji.han_to_zen(line, digit=False, ascii=False)
            sent += zen
            # Measure the converted text: the conversion can change the
            # length, and the offsets are compared against `sent`.
            ruiseki2 += len(zen)
            sparate.append(ruiseki2)
            continue
        if sent == "":
            continue

        tokens = wakati.parse(sent).split(" ")[:-1]
        boundaries = sparate
        sent, sparate, ruiseki2 = "", [], 0
        if not tokens:
            # Nothing to label (would otherwise fail on label[-1]).
            continue

        texts, label = [], []
        ruiseki = 0  # start offset of the current token
        for tok in tokens:
            end = ruiseki + len(tok)
            # Exactly one label per token, even if several boundaries hit it.
            hit = any(ruiseki < b <= end for b in boundaries)
            label.append(1 if hit else 0)
            ruiseki = end
            try:
                texts.append(fm.vocab[tok].index)
            except KeyError:
                texts.append(fm.vocab["<unk>"].index)
        label[-1] = 1
        labels.append(np.array(label))
        text.append(np.array(texts))
    return text, labels
201
-
202
-
203
-
204
def nmni_carte(lis, n_files=108):
    """Index the lines of a numbered series of text files.

    Reads ``lis + "0.txt"``, ``lis + "1.txt"``, ... and treats every
    non-blank (after ``strip()``) line as one sample.  Each line is tokenised
    with MeCab; tokens become ``fm.vocab[token].index`` (``"<unk>"`` index on
    ``KeyError``) where ``fm`` is unpickled from ``fm.pickle``.  Labels are
    all 0 except the last token of each line, which is 1.

    Args:
        lis: Path prefix; the file number and ``".txt"`` are appended to it.
        n_files: How many files (numbered from 0) to read.  Defaults to 108,
            the count that was previously hard-coded.

    Returns:
        A tuple ``(altex, allab, fukugenss)`` of parallel lists with one entry
        per file.  Each entry is a list with one element per line: a NumPy
        array of token indices, a NumPy array of labels, and the list of
        surface tokens, respectively.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")

    # NOTE: pickle.load executes arbitrary code; only use a trusted file.
    with open('fm.pickle', 'rb') as f:
        fm = pickle.load(f)
    unk_index = fm.vocab["<unk>"].index

    allab, altex, fukugenss = [], [], []
    for n in tqdm(range(n_files)):
        labels, text, fukugens = [], [], []
        with open(lis + str(n) + ".txt") as src:
            for line in src:
                line = line.strip()
                if line == "":
                    continue
                tokens = wakati.parse(line).split(" ")[:-1]
                if not tokens:
                    # Nothing to label (would otherwise fail on label[-1]).
                    continue
                texts = []
                for tok in tokens:
                    try:
                        texts.append(fm.vocab[tok].index)
                    except KeyError:
                        texts.append(unk_index)
                label = [0] * len(tokens)
                label[-1] = 1
                labels.append(np.array(label))
                text.append(np.array(texts))
                fukugens.append(list(tokens))
        allab.append(labels)
        altex.append(text)
        fukugenss.append(fukugens)
    return altex, allab, fukugenss
254
-
255
-
256
def nmni_finetune_s(lis):
    """Index every line of a text file, one sample per line.

    Loads the word2vec-format vectors ``cc.ja.300.vec`` with gensim and, as a
    side effect, (re)writes them to ``fm.pickle``.  Each line of ``lis`` is
    tokenised with MeCab; tokens become ``fm.vocab[token].index`` (``"<unk>"``
    index on ``KeyError``).  Labels are all 0 except the last token of each
    line, which is 1.

    Lines that yield no tokens (e.g. blank lines) are skipped; previously they
    raised ``IndexError`` on ``label[-1]``.

    Args:
        lis: Path of the input text file.

    Returns:
        A tuple ``(text, labels)`` of parallel lists holding one NumPy array
        of token indices and one NumPy array of labels per kept line.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)
    unk_index = fm.vocab["<unk>"].index

    labels, text = [], []
    with open(lis) as src:
        for line in src:
            line = line.strip("\t").rstrip("\n")
            tokens = wakati.parse(line).split(" ")[:-1]
            if not tokens:
                continue
            texts = []
            for tok in tokens:
                try:
                    texts.append(fm.vocab[tok].index)
                except KeyError:
                    texts.append(unk_index)
            label = [0] * len(tokens)
            label[-1] = 1
            labels.append(np.array(label))
            text.append(np.array(texts))
    return text, labels
289
-
290
-
291
def nmni_finetune_ss(lis, n_files=108):
    """Index the lines of a numbered series of text files.

    Loads the word2vec-format vectors ``cc.ja.300.vec`` with gensim and, as a
    side effect, (re)writes them to ``fm.pickle``.  Then reads
    ``lis + "0.txt"``, ``lis + "1.txt"``, ...; every non-blank line is
    tokenised with MeCab and its tokens become ``fm.vocab[token].index``
    (``"<unk>"`` index on ``KeyError``).  Labels are all 0 except the last
    token of each line, which is 1.

    Args:
        lis: Path prefix; the file number and ``".txt"`` are appended to it.
        n_files: How many files (numbered from 0) to read.  Defaults to 108,
            the count that was previously hard-coded.

    Returns:
        A tuple ``(t, l)`` of parallel lists with one entry per file; each
        entry is a list of NumPy arrays (token indices in ``t``, labels in
        ``l``), one array per kept line.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920")
    fm = gensim.models.KeyedVectors.load_word2vec_format('cc.ja.300.vec', binary=False)
    with open('fm.pickle', 'wb') as f:
        pickle.dump(fm, f)
    unk_index = fm.vocab["<unk>"].index

    t, l = [], []
    for file_no in range(n_files):
        labels, text = [], []
        with open(lis + str(file_no) + ".txt") as src:
            for line in src:
                line = line.strip("\t").rstrip("\n")
                if line == "":
                    continue
                tokens = wakati.parse(line).split(" ")[:-1]
                if not tokens:
                    # Nothing to label (would otherwise fail on label[-1]).
                    continue
                texts = []
                # A distinct name: the original reused the file index `i`.
                for tok in tokens:
                    try:
                        texts.append(fm.vocab[tok].index)
                    except KeyError:
                        texts.append(unk_index)
                label = [0] * len(tokens)
                label[-1] = 1
                labels.append(np.array(label))
                text.append(np.array(texts))
        t.append(text)
        l.append(labels)
    return t, l
331
-
332
- #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
333
- #print(model.get_subwords("間質性肺炎"))
334
- #print(model.get_subwords("誤嚥性肺炎"))
335
- #print(model.get_subwords("談話ユニット分割"))
336
-
337
- """
338
- texts = []
339
- sent = ""
340
- sparate = []
341
- label = []
342
- ruiseki = 0
343
- ruiseki2 = 0
344
- alls = []
345
- for n, line in enumerate(open("/clwork/ando/SEGBOT/randomdata.tsv")):
346
- line = line.strip("\t").rstrip("\n")
347
- if line == "":
348
- if sent == "":
349
- continue
350
- alls.append(sent)
351
- sent = ""
352
- continue
353
- else:
354
- sent += line
355
- if len(sent) != 0:
356
- alls.append(sent)
357
- random.shuffle(alls)
358
- #v = random.sample(alls, 300)
359
- #for i in v:
360
- # alls.remove(i)
361
- #t = random.sample(alls, 300)
362
- #for i in t:
363
- # alls.remove(i)
364
- with open("randomdata_concat.tsv","a")as f:
365
- f.write("\n".join())
366
- #with open("dev_fix.tsv","a")as f:
367
- # for i in v:
368
- # f.write("\n".join(i))
369
- # f.write("\n\n")
370
- #with open("test_fix.tsv","a")as f:
371
- # for i in t:
372
- # f.write("\n".join(i))
373
- # f.write("\n\n")
374
- """
375
-
376
- """
377
- out = ""
378
- for line in open("/clwork/ando/SEGBOT_BERT/alldata2_bert.tsv"):
379
- line = line.split("\t")
380
- line = line[0].strip()
381
- if line == "" or "サマリ" in line:
382
- continue
383
- out += line + "\n"
384
- with open("alldata3.tsv","w")as f:
385
- f.write(out)
386
- """
387
- """
388
- #wakati = MeCab.Tagger("-Owakati -b 81920 -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
389
- wakati = MeCab.Tagger("-Owakati -b 81920")
390
-
391
- with open('fm_space.pickle', 'rb') as f:
392
- fm = pickle.load(f)
393
- #model = gensim.models.KeyedVectors.load_word2vec_format("/clwork/ando/SEGBOT/cc.ja.300.vec", binary=False)
394
- #model = ft.load_model("/clwork/ando/SEGBOT/fast/cc.ja.300.bin")
395
- texts = []
396
- sent = ""
397
- sparate = []
398
- label = []
399
- ruiseki = 0
400
- ruiseki2 = 0
401
- alls = []
402
- for n, line in enumerate(open("/clwork/ando/SEGBOT/train_fix.tsv")):
403
- line = line.strip("\t").rstrip("\n")
404
- #print(line)
405
- if line == "":
406
- if sent == "":
407
- continue
408
- sent = wakati.parse(sent).split(" ")[:-1]
409
- flag = 0
410
- for i in sent:
411
- for j in sparate:
412
- if ruiseki+len(i) > j and ruiseki < j:
413
- label.append(1)
414
- flag = 1
415
- elif ruiseki+len(i) == j:
416
- label.append(1)
417
- flag = 1
418
- if flag == 0:
419
- label.append(0)
420
- flag = 0
421
- ruiseki += len(i)
422
- #texts += i + " "
423
- try:
424
- #texts.append(model[i])
425
- texts.append(fm.vocab[i])
426
- #texts += str(fm.vocab[i].index) + " "
427
- #print(i,str(fm.vocab[i].index))
428
- except KeyError:
429
- texts.append(fm.vocab["<unk>"])
430
- print(i)
431
- label[-1] = 1
432
- #texts = texts.rstrip() + "\t"
433
- #texts += " ".join(label) + "\n"
434
- alls.append((str(n),texts,label))
435
- sent = ""
436
- sparate = []
437
- texts = []
438
- label = []
439
- ruiseki = 0
440
- ruiseki2 = 0
441
- continue
442
- sent += mojimoji.han_to_zen(line, digit=False, ascii=False)
443
- ruiseki2 += len(line)
444
- sparate.append(ruiseki2)
445
- with open('nm_ni/train.pickle', 'wb') as f:
446
- pickle.dump(alls, f)
447
- #print(alls)
448
- #with open("resepdata_seped.tsv","w")as f:
449
- # f.write(texts)
450
- """
451
-
452
-
453
-
454
# Script section (runs at import time).
# Reads the first tab-separated column of "alldata.tsv".  Runs of lines are
# grouped into samples; a blank line, or a line containing "サマリ", closes the
# current sample (and is itself discarded).  Each sample is tokenised with
# MeCab and written to "random_labbeled.tsv" as
#     "<one '0' placeholder per token>\t<one 0/1 label per token>\n"
# where a label is 1 when an original line boundary falls inside the token or
# at its end; the last token of a sample is always labelled 1.
wakati = MeCab.Tagger("-Owakati")

texts = ""
sent = ""
sparate = []   # cumulative character offsets of the line ends in `sent`
label = []
ruiseki = 0    # start offset of the current token
ruiseki2 = 0   # running length of `sent`
rows = []
with open("alldata.tsv") as f:
    raw_lines = list(f)
# Sentinel blank line so a final sample without a trailing delimiter is
# flushed instead of being silently dropped.
raw_lines.append("")
for line in raw_lines:
    line = line.split("\t")[0].strip()
    if line != "" and "サマリ" not in line:
        sent += line
        ruiseki2 += len(line)
        sparate.append(ruiseki2)
        continue
    if sent == "":
        continue

    tokens = wakati.parse(sent).split(" ")[:-1]
    label = []
    ruiseki = 0
    for i in tokens:
        end = ruiseki + len(i)
        # Exactly one label per token, even if several boundaries hit it.
        hit = any(ruiseki < j <= end for j in sparate)
        label.append("1" if hit else "0")
        ruiseki = end
    if tokens:
        label[-1] = "1"
        # Token ids are not resolved here; "0" is written as a placeholder.
        rows.append(" ".join("0" for _ in tokens) + "\t" + " ".join(label) + "\n")
    sent = ""
    sparate = []
    ruiseki2 = 0
texts = "".join(rows)
with open("random_labbeled.tsv","w")as f:
    f.write(texts)
512
-
513
-
514
-
515
-
516
-
517
- """
518
- wakati = MeCab.Tagger("-Owakati -u /clwork/ando/SEGBOT/MANBYO_201907_Dic-utf8.dic")
519
-
520
-
521
- #fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300_space.vec', binary=False)
522
- #with open('fm_space.pickle', 'wb') as f:
523
- # pickle.dump(fm, f)
524
-
525
- with open('fm_space.pickle', 'rb') as f:
526
- fm = pickle.load(f)
527
- texts = ""
528
- sent = ""
529
- sparate = []
530
- label = []
531
- ruiseki = 0
532
- ruiseki2 = 0
533
- for line in open("/clwork/ando/SEGBOT/alldata_resep.tsv"):
534
- line = line.split("\t")
535
- line = line[0].strip("\t").rstrip("\n")
536
- #print(line)
537
- if line == "" or "サマリ" in line:
538
- if sent == "":
539
- continue
540
- print(sent)
541
- sent = sent.replace(" ","<space>")
542
- sent = wakati.parse(sent).split(" ")[:-1]
543
- print(sent)
544
- flag = 0
545
- #print(sent,sparate)
546
- for i in sent:
547
- #print(i)
548
- for j in sparate:
549
- if ruiseki+len(i) > j and ruiseki < j:
550
- #print(j)
551
- label.append("1")
552
- flag = 1
553
- elif ruiseki+len(i) == j:
554
- #print(j)
555
- label.append("1")
556
- flag = 1
557
- if flag == 0:
558
- label.append("0")
559
- flag = 0
560
- ruiseki += len(i)
561
- #texts += i + " "
562
-
563
- try:
564
- texts += str(fm.vocab[i].index) + " "
565
- #print(i,str(fm.vocab[i].index))
566
- except KeyError:
567
- texts += str(fm.vocab["<unk>"].index) + " "
568
- label[-1] = "1"
569
- texts = texts.rstrip() + "\t"
570
- texts += " ".join(label) + "\n"
571
- sent = ""
572
- sparate = []
573
- label = []
574
- ruiseki = 0
575
- ruiseki2 = 0
576
- #print(texts)
577
- continue
578
- sent += line.strip("\t")
579
- ruiseki2 += len(line)
580
- sparate.append(ruiseki2)
581
- with open("alldata2_space.tsv","w")as f:
582
- f.write(texts)
583
- """
584
-
585
-
586
-
587
- """
588
- wakati = MeCab.Tagger("-Owakati")
589
-
590
- fm = gensim.models.KeyedVectors.load_word2vec_format('/clwork/ando/SEGBOT/cc.ja.300.vec', binary=False)
591
- texts = ""
592
- sent = ""
593
- cand = ""
594
- sparate = []
595
- label = []
596
- ruiseki = 0
597
- ruiseki2 = 0
598
- flag2 = 1
599
- for line in open("data2.tsv"):
600
- line = line.split("\t")
601
- if flag2 == 1:
602
- cand = line
603
- flag2 = 2
604
- continue
605
- if flag2 == 2:
606
- flag2 = 1
607
- #print(line,cand)
608
- for n,z in enumerate(zip(cand,line)):
609
- i = z[0]
610
- j = z[1]
611
- n = n+1
612
- if i == "":
613
- sent = wakati.parse(sent).split(" ")[:-1]
614
- flag = 0
615
- #print(sent,sparate)
616
- for i in sent:
617
- #print(i)
618
- for j in sparate:
619
- if ruiseki+len(i) > j and ruiseki < j:
620
- #print(j)
621
- label.append("1")
622
- flag = 1
623
- elif ruiseki+len(i) == j:
624
- #print(j)
625
- label.append("1")
626
- flag = 1
627
- if flag == 0:
628
- label.append("0")
629
- flag = 0
630
- ruiseki += len(i)
631
- #texts += i + " "
632
-
633
- try:
634
- texts += str(fm.vocab[i].index) + " "
635
- except KeyError:
636
- texts += str(fm.vocab["<unk>"].index) + " "
637
-
638
- label[-1] = "1"
639
- texts = texts.rstrip() + "\t"
640
- texts += " ".join(label) + "\n"
641
- sent = ""
642
- sparate = []
643
- label = []
644
- ruiseki = 0
645
- ruiseki2 = 0
646
- #print(texts)
647
- break
648
- if j == "|":
649
- sparate.append(n)
650
- sent += i
651
- with open("alldata.tsv","w")as f:
652
- f.write(texts)
653
- """