from pathlib import Path
from typing import List, Optional, Tuple

import gradio as gr
import numpy as np
import torch
from sudachipy import dictionary
from sudachipy import tokenizer as sudachi_tokenizer
from transformers import AutoModelForCausalLM, PreTrainedTokenizer, T5Tokenizer


# Fine-tuned model and its tokenizer, loaded from the ./model directory next to this script.
model_dir = Path(__file__).parents[0] / "model"
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = T5Tokenizer.from_pretrained(model_dir)
tokenizer.do_lower_case = True
trained_model = AutoModelForCausalLM.from_pretrained(model_dir)
trained_model.to(device)

# Baseline for comparison: the pretrained rinna/japanese-gpt2-medium before fine-tuning
baseline_model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-medium")
baseline_model.to(device)

# Sudachi word tokenizer (split mode C, the longest word units) for word-level aggregation
sudachi_tokenizer_obj = dictionary.Dictionary().create()
mode = sudachi_tokenizer.Tokenizer.SplitMode.C


def sudachi_tokenize(input_text: str) -> List[str]:
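    """Split the input into Sudachi word surface forms (split mode C, the longest units)."""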
    morphemes = sudachi_tokenizer_obj.tokenize(input_text, mode)
    return [morpheme.surface() for morpheme in morphemes]


def calc_offsets(tokens: List[str]) -> List[int]:
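    """Return cumulative character offsets for the given tokens, starting at 0."""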
    offsets = [0]
    for token in tokens:
        offsets.append(offsets[-1] + len(token))
    return offsets


def distribute_surprisals_to_characters(
    tokens2surprisal: List[Tuple[str, float]]
) -> List[Tuple[str, float]]:
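    """Spread each token's surprisal evenly over its characters so that scores
    can later be re-grouped along Sudachi word boundaries."""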
    tokens2surprisal_by_character: List[Tuple[str, float]] = []
    for token, surprisal in tokens2surprisal:
        token_len = len(token)
        for character in token:
            tokens2surprisal_by_character.append((character, surprisal / token_len))
    return tokens2surprisal_by_character


def calculate_surprisals_by_character(
    input_text: str, model: AutoModelForCausalLM, tokenizer: PreTrainedTokenizer
) -> Tuple[float, List[Tuple[str, float]]]:
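    """Compute the surprisal (-log p) of every subword in `input_text` under `model`.

    Subword tokens have the SentencePiece "▁" marker stripped. Returns the mean
    surprisal over the input together with per-character surprisals obtained by
    distributing each subword's surprisal evenly over its characters.
    """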
    input_tokens = [
        token.replace("▁", "")
        for token in tokenizer.tokenize(input_text)
        if token != "▁"
    ]
    input_ids = tokenizer.encode(
        "<s>" + input_text, add_special_tokens=False, return_tensors="pt"
    ).to(device)

    with torch.no_grad():  # inference only; gradients are never needed here
        logits = model(input_ids)["logits"].squeeze(0)

    surprisals = []
    for i in range(logits.shape[0] - 1):
        # Skip positions whose next token is id 9, which this script treats as the
        # bare "▁" word-boundary piece, so surprisals stay aligned with input_tokens.
        if input_ids[0][i + 1] == 9:
            continue
        logit = logits[i]
        prob = torch.softmax(logit, dim=0)
        neg_logprob = -torch.log(prob)
        surprisals.append(neg_logprob[input_ids[0][i + 1]].item())
    mean_surprisal = np.mean(surprisals)

    tokens2surprisal: List[Tuple[str, float]] = []
    for token, surprisal in zip(input_tokens, surprisals):
        tokens2surprisal.append((token, surprisal))

    char2surprisal = distribute_surprisals_to_characters(tokens2surprisal)

    return mean_surprisal, char2surprisal


def aggregate_surprisals_by_offset(
    char2surprisal: List[Tuple[str, float]], offsets: List[int]
) -> List[Tuple[str, float]]:
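    """Re-assemble the words defined by `offsets`, summing the character-level
    surprisals that fall inside each [offsets[i], offsets[i + 1]) span."""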
    tokens2surprisal = []
    for i in range(len(offsets) - 1):
        start = offsets[i]
        end = offsets[i + 1]
        surprisal = sum([surprisal for _, surprisal in char2surprisal[start:end]])
        token = "".join([char for char, _ in char2surprisal[start:end]])
        tokens2surprisal.append((token, surprisal))

    return tokens2surprisal


def highlight_token(token: str, score: float):
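    """Wrap `token` in a <span> whose background shades from white to red for
    positive scores and from white to blue for negative scores."""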
    # Clamp so that out-of-range scores (e.g. from the scaled diff view) still
    # produce a valid "#RRGGBB" colour.
    score = max(-1.0, min(1.0, score))
    if score > 0:
        html_color = "#%02X%02X%02X" % (
            255,
            int(255 * (1 - score)),
            int(255 * (1 - score)),
        )
    else:
        html_color = "#%02X%02X%02X" % (
            int(255 * (1 + score)),
            int(255 * (1 + score)),
            255,
        )
    return '<span style="background-color: {}; color: black">{}</span>'.format(
        html_color, token
    )


def create_highlighted_text(
    label: str,
    tokens2scores: List[Tuple[str, float]],
    mean_surprisal: Optional[float] = None,
):
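    """Concatenate the highlighted tokens under an <h2> heading, optionally
    appending the mean surprisal (サプライザル平均値) to the label."""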
    if mean_surprisal is None:
        highlighted_text = "<h2><b>" + label + "</b></h2>"
    else:
        highlighted_text = (
            "<h2><b>" + label + f"</b>(サプライザル平均値: {mean_surprisal:.3f})</h2>"
        )
    for token, score in tokens2scores:
        highlighted_text += highlight_token(token, score)
    return highlighted_text


def normalize_surprisals(
    tokens2surprisal: List[Tuple[str, float]], log_scale: bool = False
) -> List[Tuple[str, float]]:
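    """Min-max normalize surprisals to [0, 1] (optionally on a log scale) so that
    they can be used directly as highlight intensities."""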
    if log_scale:
        surprisals = [np.log(surprisal) for _, surprisal in tokens2surprisal]
    else:
        surprisals = [surprisal for _, surprisal in tokens2surprisal]
    min_surprisal = np.min(surprisals)
    max_surprisal = np.max(surprisals)
    surprisals = [
        (surprisal - min_surprisal) / (max_surprisal - min_surprisal)
        for surprisal in surprisals
    ]
    assert min(surprisals) >= 0
    assert max(surprisals) <= 1
    return [
        (token, surprisal)
        for (token, _), surprisal in zip(tokens2surprisal, surprisals)
    ]


def calculate_surprisal_diff(
    tokens2surprisal: List[Tuple[str, float]],
    baseline_tokens2surprisal: List[Tuple[str, float]],
    scale: float = 100.0,
):
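    """Per-token surprisal difference (fine-tuned minus baseline), multiplied by
    `scale` so that small differences remain visible in the highlighting."""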
    diff_tokens2surprisal = [
        (token, (surprisal - baseline_surprisal) * scale)
        for (token, surprisal), (_, baseline_surprisal) in zip(
            tokens2surprisal, baseline_tokens2surprisal
        )
    ]
    return diff_tokens2surprisal


def main(input_text: str) -> Tuple[str, str, str]:
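    """Return three HTML strings: highlights from the baseline model (学習前モデル),
    from the fine-tuned model (学習後モデル), and their difference (学習前後の差分)."""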
    mean_surprisal, char2surprisal = calculate_surprisals_by_character(
        input_text, trained_model, tokenizer
    )
    offsets = calc_offsets(sudachi_tokenize(input_text))
    tokens2surprisal = aggregate_surprisals_by_offset(char2surprisal, offsets)
    tokens2surprisal = normalize_surprisals(tokens2surprisal)

    highlighted_text = create_highlighted_text(
        "学習後モデル", tokens2surprisal, mean_surprisal
    )

    (
        baseline_mean_surprisal,
        baseline_char2surprisal,
    ) = calculate_surprisals_by_character(input_text, baseline_model, tokenizer)
    baseline_tokens2surprisal = aggregate_surprisals_by_offset(
        baseline_char2surprisal, offsets
    )
    baseline_tokens2surprisal = normalize_surprisals(baseline_tokens2surprisal)
    baseline_highlighted_text = create_highlighted_text(
        "学習前モデル", baseline_tokens2surprisal, baseline_mean_surprisal
    )

    diff_tokens2surprisal = calculate_surprisal_diff(
        tokens2surprisal, baseline_tokens2surprisal, 100.0
    )
    diff_highlighted_text = create_highlighted_text(
        "学習前後の差分", diff_tokens2surprisal, None
    )
    return (
        baseline_highlighted_text,
        highlighted_text,
        diff_highlighted_text,
    )


if __name__ == "__main__":
    demo = gr.Interface(
        fn=main,
        title="文章の読みやすさを自動評価するAI",
        description="文章を入力すると、読みづらい表現は赤く、読みやすい表現は青くハイライトされて出力されます。",
        show_label=True,
        inputs=gr.Textbox(
            lines=5,
            label="文章",
            placeholder="ここに文章を入力してください。",
        ),
        outputs=[
            gr.HTML(label="学習前モデル", show_label=True),
            gr.HTML(label="学習後モデル", show_label=True),
            gr.HTML(label="学習前後の差分", show_label=True),
        ],
        examples=[
            "太郎が二郎を殴った。",
            "太郎が二郎に殴った。",
            "サイエンスインパクトラボは、国立研究開発法人科学技術振興機構(JST)の「科学と社会」推進部が行う共創プログラムです。「先端の研究開発を行う研究者」と「社会課題解決に取り組むプレイヤー」が約3ヶ月に渡って共創活動を行います。",
            "近年、ニューラル言語モデルが自然言語の統語知識をどれほど有しているかを、容認性判断課題を通して検証する研究が行われてきている。しかし、このような言語モデルの統語的評価を行うためのデータセットは、主に英語を中心とした欧米の諸言語を対象に構築されてきた。本研究では、既存のデータセットの問題点を克服しつつ、このようなデータセットが構築されてこなかった日本語を対象とした初めてのデータセットである JCoLA (JapaneseCorpus of Linguistic Acceptability) を構築した上で、それを用いた言語モデルの統語的評価を行った。",
        ],
    )

    demo.launch()