File size: 2,032 Bytes
6684a21
 
 
 
88b75d2
6684a21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88b75d2
6684a21
 
8bc03af
6684a21
 
 
 
 
88b75d2
 
 
 
 
 
6684a21
88b75d2
6684a21
 
 
f450fdc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import gradio as gr
from toiro import tokenizers

# --- UI configuration constants for the Gradio demo ---
num_input_lines = 3  # height (in lines) of the input textbox
# Default Japanese prompt shown in the input box ("Enter text here and press Enter.")
default_text = "γ“γ“γ«γƒ†γ‚­γ‚Ήγƒˆγ‚’ε…₯εŠ›γ—γ€Enter γ‚’ζŠΌγ—γ¦γγ γ•γ„γ€‚"
title = "Japanese Tokenizer Comparison"
description = """
This is a demo comparing Japanese tokenizers. You can compare the tokenization results of tools that are available with just a `pip install` in Python.
"""

# Markdown shown below the demo: install instructions for each tokenizer.
# NOTE: the last entry's label was fixed from "fugashi_ipadic" to
# "fugashi_unidic" — the command installs the unidic-lite variant.
article = """

# How to install each library

[Janome](https://github.com/mocobeta/janome):
```
pip install janome
```

[nagisa](https://github.com/taishi-i/nagisa):
```
pip install nagisa
```

[sudachi.rs](https://github.com/WorksApplications/sudachi.rs):
```
pip install sudachipy sudachidict_core
```

[mecab-python3](https://github.com/SamuraiT/mecab-python3):
```
pip install mecab-python3
```

[fugashi_ipadic](https://github.com/polm/fugashi):
```
pip install fugashi ipadic
```

[fugashi_unidic](https://github.com/polm/fugashi):
```
pip install fugashi unidic-lite
```

"""


def tokenize(text):
    """Tokenize *text* with each supported Japanese tokenizer.

    Returns a 6-tuple of results, in the order the output textboxes
    are declared: Janome, nagisa, sudachi.rs, mecab-python3,
    fugashi (ipadic dictionary), fugashi (unidic dictionary).
    """
    # One callable per tokenizer, in output order.
    tokenizer_fns = (
        tokenizers.tokenize_janome,
        tokenizers.tokenize_nagisa,
        tokenizers.tokenize_sudachipy,
        tokenizers.tokenize_mecab,
        tokenizers.tokenize_fugashi_ipadic,
        tokenizers.tokenize_fugashi_unidic,
    )
    return tuple(fn(text) for fn in tokenizer_fns)


# Build the Gradio UI: one input textbox and one output textbox per tokenizer.
# Output labels are in the same order as the tuple returned by tokenize().
output_labels = [
    "Janome",
    "nagisa",
    "sudachi.rs",
    "mecab-python3",
    "fugashi_ipadic",
    "fugashi_unidic",
]

iface = gr.Interface(
    fn=tokenize,
    inputs=gr.Textbox(
        label="Input text",
        lines=num_input_lines,
        value=default_text,
    ),
    outputs=[gr.Textbox(label=label) for label in output_labels],
    title=title,
    description=description,
    article=article,
    live=True,  # re-tokenize on every input change, no submit needed
)

iface.launch()