File size: 2,429 Bytes
926183f
 
 
 
 
 
 
 
 
46a030d
 
d4c02d7
9ee38ea
89a2a73
 
 
 
 
 
 
 
 
46a030d
e3ce53f
46a030d
 
 
 
 
 
 
 
 
 
 
926183f
46a030d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
926183f
89a2a73
46a030d
 
 
97b1ca5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import re
import pickle
import numpy as np
import random
import torch
from solver import TrainSolver

from model import PointerNetworks
import gensim
import MeCab
import pysbd
import io



class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that forces torch storages onto the CPU.

    Pickles produced on a CUDA machine embed GPU-backed storages;
    intercepting ``torch.storage._load_from_bytes`` and rerouting it
    through ``torch.load(..., map_location='cpu')`` lets such pickles be
    restored on CPU-only hosts. All other classes resolve normally.
    """

    def find_class(self, module, name):
        # Only the torch storage loader is redirected; defer to the
        # default resolution for everything else.
        if (module, name) == ('torch.storage', '_load_from_bytes'):
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)


def create_data(doc, fm, split_method):
    """Tokenize *doc* into sentence-segmented word-index arrays.

    Args:
        doc: Raw Japanese text.
        fm: Embedding vocabulary object; ``fm.vocab`` maps a token to an
            entry with an ``.index`` attribute (assumes a "<unk>" entry
            exists — gensim<4 KeyedVectors API; TODO confirm version).
        split_method: "pySBD" segments sentences with pysbd; any other
            value splits on sentence-final punctuation ("。" and ".").

    Returns:
        (altex, allab, fukugenss): three parallel single-element lists
        (one slot per document) holding, per sentence, the np.array of
        vocab indices, the np.array of 0/1 labels with the final token
        flagged 1, and the surface token strings for restoration.
    """
    wakati = MeCab.Tagger("-Owakati -b 81920 -r /etc/mecabrc -d /home/user/app/mecab-ipadic-neologd")
    seg = pysbd.Segmenter(language="ja", clean=False)
    allab, altex, fukugenss = [], [], []
    labels, text = [], []
    # Single-iteration loop kept so the output shape (list of one
    # document) stays identical for existing callers.
    for _ in range(1):
        fukugens = []
        if split_method == "pySBD":
            lines = seg.segment(doc)
        else:
            # Fallback: break after Japanese/Latin full stops, then
            # collapse runs of newlines (raw string for the regex).
            doc = doc.strip().replace("。", "。\n").replace(".", ".\n")
            doc = re.sub(r"\n+", "\n", doc)
            lines = doc.split("\n")
        for line in lines:
            line = line.strip()
            if line == "":
                continue
            # MeCab's -Owakati output ends with a "\n" element; drop it.
            tokens = wakati.parse(line).split(" ")[:-1]
            if not tokens:
                # Guard: an empty token list would crash label[-1] below.
                continue
            texts, label, fukugen = [], [], []
            for tok in tokens:
                try:
                    texts.append(fm.vocab[tok].index)
                except KeyError:
                    # Out-of-vocabulary tokens map to the <unk> index.
                    texts.append(fm.vocab["<unk>"].index)
                fukugen.append(tok)
                label.append(0)
            label[-1] = 1  # mark the sentence-final token
            labels.append(np.array(label))
            text.append(np.array(texts))
            fukugens.append(fukugen)
        allab.append(labels)
        altex.append(text)
        fukugenss.append(fukugens)
        labels, text = [], []
    return altex, allab, fukugenss


def generate(doc, mymodel, fm, index2word, split_method):
    """Segment *doc* with the model and return its output texts.

    Builds index/label arrays for the document via ``create_data`` and
    feeds them to the model's ``check_accuracy`` for decoding.
    """
    features, targets, restore = create_data(doc, fm, split_method)
    return mymodel.check_accuracy(features, targets, index2word, restore)



def setup():
    """Load the serialized artifacts from the current working directory.

    Returns:
        (mysolver, fm, index2word): the CPU-remapped solver, the
        embedding vocabulary object, and the index-to-word table.
    """
    # NOTE(review): pickle.load on untrusted files is unsafe — these are
    # assumed to be trusted local artifacts shipped with the app.
    with open('index2word.pickle', 'rb') as handle:
        index2word = pickle.load(handle)
    # The solver may have been pickled on a GPU host; CPU_Unpickler
    # remaps its torch storages onto the CPU.
    with open('model.pickle', 'rb') as handle:
        mysolver = CPU_Unpickler(handle).load()
    with open('fm.pickle', 'rb') as handle:
        fm = pickle.load(handle)
    return mysolver, fm, index2word