indiejoseph commited on
Commit
af3d42a
·
0 Parent(s):

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ __pycache__
2
+ pytorch_model.bin
3
+ onnx/*
4
+ .cache
5
+ cmudict.rep
6
+ cmudict_cache.pickle
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from infer import OnnxInferenceSession
2
+ from text import cleaned_text_to_sequence, get_bert
3
+ from text.cleaner import clean_text
4
+ import numpy as np
5
+ from huggingface_hub import hf_hub_download
6
+ import asyncio
7
+ from pathlib import Path
8
+
9
# Shared inference session. Starts out empty and is filled in by
# get_onnx_session() the first time a session is requested.
OnnxSession = None

# Files the application needs on disk. Each entry names a Hugging Face Hub
# repository ("repo_id"), the files required from it ("files") and the local
# directory they are stored under ("local_path").
models = [
    dict(
        local_path="./bert/bert-large-cantonese",
        repo_id="hon9kon9ize/bert-large-cantonese",
        files=["pytorch_model.bin"],
    ),
    dict(
        local_path="./bert/deberta-v3-large",
        repo_id="microsoft/deberta-v3-large",
        files=["spm.model", "pytorch_model.bin"],
    ),
    dict(
        local_path="./onnx",
        repo_id="hon9kon9ize/bert-vits-zoengjyutgaai-onnx",
        files=[
            "BertVits2.2PT.json",
            "BertVits2.2PT/BertVits2.2PT_enc_p.onnx",
            "BertVits2.2PT/BertVits2.2PT_emb.onnx",
            "BertVits2.2PT/BertVits2.2PT_dp.onnx",
            "BertVits2.2PT/BertVits2.2PT_sdp.onnx",
            "BertVits2.2PT/BertVits2.2PT_flow.onnx",
            "BertVits2.2PT/BertVits2.2PT_dec.onnx",
        ],
    ),
]
41
+
42
def get_onnx_session():
    """Return the shared ONNX inference session, building it on first use.

    The session is kept in the module-level ``OnnxSession`` variable, so
    ``OnnxInferenceSession`` is constructed at most once per process. It is
    created with the CPU execution provider and the six component model
    files under ``onnx/BertVits2.2PT``.
    """
    global OnnxSession

    if OnnxSession is None:
        component_paths = {
            "enc": "onnx/BertVits2.2PT/BertVits2.2PT_enc_p.onnx",
            "emb_g": "onnx/BertVits2.2PT/BertVits2.2PT_emb.onnx",
            "dp": "onnx/BertVits2.2PT/BertVits2.2PT_dp.onnx",
            "sdp": "onnx/BertVits2.2PT/BertVits2.2PT_sdp.onnx",
            "flow": "onnx/BertVits2.2PT/BertVits2.2PT_flow.onnx",
            "dec": "onnx/BertVits2.2PT/BertVits2.2PT_dec.onnx",
        }
        OnnxSession = OnnxInferenceSession(
            component_paths,
            Providers=["CPUExecutionProvider"],
        )

    return OnnxSession
60
+
61
def download_model_files(repo_id, files, local_path):
    """Fetch from ``repo_id`` every entry of ``files`` missing locally.

    A file counts as present when ``local_path / file`` exists; such files
    are skipped. Missing ones are downloaded with ``hf_hub_download`` into
    ``local_path`` with symlinks disabled.
    """
    target_dir = Path(local_path)
    # Lazy generator: each existence check runs right before its own
    # download, exactly as in a plain loop.
    missing = (name for name in files if not (target_dir / name).exists())
    for name in missing:
        hf_hub_download(
            repo_id, name, local_dir=local_path, local_dir_use_symlinks=False
        )
67
+
68
def download_models():
    """Ensure the files of every entry in ``models`` are present locally."""
    for entry in models:
        download_model_files(
            repo_id=entry["repo_id"],
            files=entry["files"],
            local_path=entry["local_path"],
        )
71
+
72
def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after the elements.

    The result has ``2 * len(lst) + 1`` entries; an empty ``lst`` yields
    ``[item]``.
    """
    padded = [item]
    for element in lst:
        padded.append(element)
        padded.append(item)
    return padded
76
+
77
def get_text(text, language_str, style_text=None, style_weight=0.7):
    """Turn raw text into the NumPy inputs passed to the ONNX session.

    Args:
        text: Text to synthesize.
        language_str: ``"EN"`` or ``"YUE"``; anything else raises.
        style_text: Optional style reference forwarded to ``get_bert``.
            An empty string is treated as ``None``.
        style_weight: Style weight forwarded to ``get_bert``.

    Returns:
        ``(en_bert, yue_bert, phone, tone, language)`` as NumPy arrays. The
        BERT features of the requested language come from ``get_bert``; the
        other language's features are random values of shape
        ``(1024, len(phone))``. Both feature arrays are transposed.

    Raises:
        ValueError: If ``language_str`` is neither ``"EN"`` nor ``"YUE"``.
    """
    if style_text == "":
        style_text = None

    # Implement the current version of get_text here.
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    # Add blanks: a 0 goes before, between and after the original entries.
    phone = intersperse(phone, 0)
    tone = intersperse(tone, 0)
    language = intersperse(language, 0)

    # Update word2ph in place to account for the inserted blanks: every count
    # is doubled and the first gets one extra (presumably so the counts still
    # add up to the padded phone length -- confirm against get_bert).
    for idx, count in enumerate(word2ph):
        word2ph[idx] = count * 2
    word2ph[0] += 1

    bert_ori = get_bert(
        norm_text, word2ph, language_str, "cpu", style_text, style_weight
    )
    del word2ph
    assert bert_ori.shape[-1] == len(phone), phone

    if language_str == "EN":
        en_bert, yue_bert = bert_ori, np.random.randn(1024, len(phone))
    elif language_str == "YUE":
        en_bert, yue_bert = np.random.randn(1024, len(phone)), bert_ori
    else:
        raise ValueError("language_str should be EN or YUE")

    assert yue_bert.shape[-1] == len(
        phone
    ), f"Bert seq len {yue_bert.shape[-1]} != {len(phone)}"

    return (
        np.asarray(en_bert.T),
        np.asarray(yue_bert.T),
        np.asarray(phone),
        np.asarray(tone),
        np.asarray(language),
    )
117
+
118
+ # Text-to-speech function
119
async def text_to_speech(text, sid=0, language="YUE"):
    """Synthesize speech for ``text`` and return the generated waveform.

    Args:
        text: Text to convert. ``None``, empty or whitespace-only text is
            rejected with a Gradio warning.
        sid: Speaker id, passed to the session as a one-element array.
        language: Language code forwarded to ``get_text``.

    Returns:
        ``audio[0][0]`` of the session output, or ``None`` when ``text`` is
        blank. The blank case previously returned a
        ``(None, gr.Warning(...))`` tuple, which did not match the normal
        return value and could not be used as audio data by callers.
    """
    # Reject blank input first so that an empty request neither builds the
    # ONNX session nor runs text processing.
    if not text or not text.strip():
        gr.Warning("Please enter text to convert.")
        return None

    session = get_onnx_session()
    # Distinct local names keep the ``language`` argument from being
    # overwritten by the language-id array returned by get_text.
    en_bert, yue_bert, phones, tones, language_ids = get_text(text, language)
    speaker = np.array([sid])
    audio = session(phones, tones, language_ids, en_bert, yue_bert, speaker)

    return audio[0][0]
128
+
129
+
130
+ # Create Gradio application
131
+ import gradio as gr
132
+
133
+ # Gradio interface function
134
def tts_interface(text):
    """Gradio callback: synthesize Cantonese speech for ``text``.

    Returns:
        ``(44100, waveform)`` for the audio output component, or ``None``
        when no waveform was produced (blank input), which leaves the audio
        output empty instead of handing it an invalid payload.
    """
    result = asyncio.run(text_to_speech(text, 0, "YUE"))

    # For blank input text_to_speech does not return a waveform: it yields
    # either None or a (None, warning) tuple. Neither can be wrapped as
    # (sample_rate, data), so report "no audio" in both cases.
    if result is None or isinstance(result, tuple):
        return None

    return 44100, result
137
+
138
async def create_demo():
    """Build and return the Gradio interface for the TTS demo.

    The interface has one multi-line text input and one audio output, and
    calls ``tts_interface`` for each request. Analytics and flagging are
    disabled.
    """
    description = """廣東話語音生成器,基於Bert-VITS2模型

注意:model 本身支持廣東話同英文,但呢個 space 未實現中英夾雜生成。
"""

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
        ],
        outputs=[
            gr.Audio(label="Generated Audio"),
        ],
        title="Cantonese TTS Text-to-Speech",
        description=description,
        analytics_enabled=False,
        # Gradio expects one of the strings "never" / "auto" / "manual" here;
        # the boolean form is deprecated. "never" keeps flagging disabled.
        allow_flagging="never",
    )
    return demo
158
+
159
+
160
+ # Run the application
161
if __name__ == "__main__":
    # Fetch any model files that are not on disk yet before building the UI.
    download_models()

    # create_demo is a coroutine function, so it is run to completion with
    # asyncio.run before the resulting interface is launched.
    demo = asyncio.run(create_demo())
    demo.launch()
bert/bert-large-cantonese/README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ language:
4
+ - yue
5
+ license: cc-by-4.0
6
+ tags:
7
+ - generated_from_trainer
8
+ pipeline_tag: fill-mask
9
+ widget:
10
+ - text: 香港原本[MASK]一個人煙稀少嘅漁港。
11
+ example_title: 係
12
+ model-index:
13
+ - name: bert-large-cantonese
14
+ results: []
15
+ ---
16
+
17
+ # bert-large-cantonese
18
+
19
+ ## Description
20
+
21
+ This model is trained from scratch on Cantonese text. It is a BERT model with a large architecture (24-layer, 1024-hidden, 16-heads, 326M parameters).
22
+
23
+ The first training stage pre-trains the model on sequences of length 128 with a batch size of 512 for 1 epoch. The second stage continues pre-training the model on sequences of length 512 with a batch size of 512 for one more epoch.
24
+
25
+ ## How to use
26
+
27
+ You can use this model directly with a pipeline for masked language modeling:
28
+
29
+ ```python
30
+ from transformers import pipeline
31
+
32
+ mask_filler = pipeline(
33
+ "fill-mask",
34
+ model="hon9kon9ize/bert-large-cantonese"
35
+ )
36
+
37
+ mask_filler("雞蛋六隻,糖呢就兩茶匙,仲有[MASK]橙皮添。")
38
+
39
+ ; [{'score': 0.08160534501075745,
40
+ ; 'token': 943,
41
+ ; 'token_str': '個',
42
+ ; 'sequence': '雞 蛋 六 隻 , 糖 呢 就 兩 茶 匙 , 仲 有 個 橙 皮 添 。'},
43
+ ; {'score': 0.06182105466723442,
44
+ ; 'token': 1576,
45
+ ; 'token_str': '啲',
46
+ ; 'sequence': '雞 蛋 六 隻 , 糖 呢 就 兩 茶 匙 , 仲 有 啲 橙 皮 添 。'},
47
+ ; {'score': 0.04600336775183678,
48
+ ; 'token': 1646,
49
+ ; 'token_str': '嘅',
50
+ ; 'sequence': '雞 蛋 六 隻 , 糖 呢 就 兩 茶 匙 , 仲 有 嘅 橙 皮 添 。'},
51
+ ; {'score': 0.03743772581219673,
52
+ ; 'token': 3581,
53
+ ; 'token_str': '橙',
54
+ ; 'sequence': '雞 蛋 六 隻 , 糖 呢 就 兩 茶 匙 , 仲 有 橙 橙 皮 添 。'},
55
+ ; {'score': 0.031560592353343964,
56
+ ; 'token': 5148,
57
+ ; 'token_str': '紅',
58
+ ; 'sequence': '雞 蛋 六 隻 , 糖 呢 就 兩 茶 匙 , 仲 有 紅 橙 皮 添 。'}]
59
+ ```
60
+
61
+ ## Training hyperparameters
62
+
63
+ The following hyperparameters were used during the first training stage:
64
+
65
+ - Batch size: 512
66
+ - Learning rate: 1e-4
67
+ - Learning rate scheduler: linear decay
68
+ - 1 Epoch
69
+ - Warmup ratio: 0.1
70
+
71
+ Loss plot on [W&B](https://api.wandb.ai/links/indiejoseph/v3ljlpmp)
72
+
73
+ The following hyperparameters were used during the second training stage:
74
+
75
+ - Batch size: 512
76
+ - Learning rate: 5e-5
77
+ - Learning rate scheduler: linear decay
78
+ - 1 Epoch
79
+ - Warmup ratio: 0.1
80
+
81
+ Loss plot on [W&B](https://api.wandb.ai/links/indiejoseph/vcm3q1ef)
82
+
bert/bert-large-cantonese/added_tokens.json ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "㔷": 21620,
3
+ "㖭": 21330,
4
+ "㚻": 21255,
5
+ "㞗": 21216,
6
+ "㞘": 21384,
7
+ "㦸": 21493,
8
+ "㨂": 21307,
9
+ "㩒": 21182,
10
+ "㴓": 21485,
11
+ "㷫": 21265,
12
+ "乸": 21143,
13
+ "仼": 21501,
14
+ "佮": 21234,
15
+ "侘": 21537,
16
+ "偲": 21220,
17
+ "僆": 21421,
18
+ "僞": 21471,
19
+ "僳": 21564,
20
+ "儁": 21422,
21
+ "儍": 21388,
22
+ "兗": 21368,
23
+ "冚": 21138,
24
+ "冧": 21137,
25
+ "凖": 21454,
26
+ "勷": 21522,
27
+ "卌": 21284,
28
+ "卽": 21186,
29
+ "厏": 21439,
30
+ "厓": 21449,
31
+ "厠": 21256,
32
+ "厹": 21285,
33
+ "吔": 21205,
34
+ "吲": 21403,
35
+ "吿": 21547,
36
+ "呑": 21331,
37
+ "呔": 21204,
38
+ "咃": 21533,
39
+ "咇": 21300,
40
+ "咼": 21565,
41
+ "哚": 21376,
42
+ "唂": 21402,
43
+ "唒": 21250,
44
+ "唓": 21401,
45
+ "唞": 21175,
46
+ "唥": 21144,
47
+ "唨": 21159,
48
+ "唪": 21146,
49
+ "唻": 21223,
50
+ "啋": 21428,
51
+ "啩": 21178,
52
+ "啹": 21482,
53
+ "喐": 21165,
54
+ "喥": 21316,
55
+ "喼": 21192,
56
+ "嗌": 21129,
57
+ "嗮": 21130,
58
+ "嗱": 21145,
59
+ "嘥": 21151,
60
+ "噃": 21197,
61
+ "噉": 21128,
62
+ "噏": 21170,
63
+ "噚": 21135,
64
+ "嚙": 21282,
65
+ "嚡": 21236,
66
+ "嚦": 21455,
67
+ "嚫": 21346,
68
+ "嚹": 21158,
69
+ "嚿": 21134,
70
+ "囇": 21612,
71
+ "囖": 21140,
72
+ "囘": 21504,
73
+ "坭": 21315,
74
+ "垻": 21538,
75
+ "埐": 21294,
76
+ "埞": 21180,
77
+ "埲": 21288,
78
+ "堊": 21309,
79
+ "塡": 21511,
80
+ "塱": 21187,
81
+ "塲": 21445,
82
+ "塹": 21481,
83
+ "奀": 21306,
84
+ "奭": 21492,
85
+ "妺": 21465,
86
+ "姵": 21536,
87
+ "娸": 21569,
88
+ "媺": 21431,
89
+ "嫗": 21311,
90
+ "嫰": 21323,
91
+ "嬋": 21400,
92
+ "嬲": 21131,
93
+ "孭": 21179,
94
+ "孲": 21210,
95
+ "孻": 21264,
96
+ "尐": 21157,
97
+ "尙": 21520,
98
+ "尢": 21619,
99
+ "屘": 21484,
100
+ "屙": 21160,
101
+ "岃": 21392,
102
+ "嶠": 21267,
103
+ "幗": 21269,
104
+ "幪": 21279,
105
+ "廡": 21530,
106
+ "廸": 21217,
107
+ "廻": 21479,
108
+ "彊": 21446,
109
+ "彖": 21335,
110
+ "徂": 21155,
111
+ "忟": 21301,
112
+ "惗": 21353,
113
+ "愃": 21527,
114
+ "愨": 21562,
115
+ "慇": 21603,
116
+ "慤": 21389,
117
+ "憓": 21477,
118
+ "戇": 21181,
119
+ "戙": 21281,
120
+ "戥": 21162,
121
+ "扤": 21541,
122
+ "扲": 21549,
123
+ "扺": 21293,
124
+ "抆": 21266,
125
+ "抌": 21258,
126
+ "抺": 21238,
127
+ "拃": 21188,
128
+ "拏": 21271,
129
+ "拕": 21476,
130
+ "挐": 21524,
131
+ "捽": 21212,
132
+ "掕": 21166,
133
+ "掗": 21486,
134
+ "掟": 21153,
135
+ "掹": 21214,
136
+ "揈": 21251,
137
+ "揞": 21429,
138
+ "揦": 21371,
139
+ "揼": 21184,
140
+ "揾": 21132,
141
+ "搣": 21222,
142
+ "搦": 21383,
143
+ "搲": 21317,
144
+ "搾": 21398,
145
+ "摑": 21268,
146
+ "摱": 21438,
147
+ "摷": 21209,
148
+ "撘": 21224,
149
+ "撣": 21615,
150
+ "撳": 21141,
151
+ "撾": 21183,
152
+ "擗": 21589,
153
+ "擧": 21521,
154
+ "擸": 21334,
155
+ "攆": 21544,
156
+ "攰": 21139,
157
+ "攷": 21270,
158
+ "旚": 21582,
159
+ "旯": 21280,
160
+ "旼": 21399,
161
+ "昃": 21483,
162
+ "昅": 21528,
163
+ "昪": 21377,
164
+ "昰": 21459,
165
+ "昺": 21380,
166
+ "暎": 21558,
167
+ "暪": 21437,
168
+ "曱": 21185,
169
+ "朏": 21557,
170
+ "朳": 21572,
171
+ "柙": 21551,
172
+ "栢": 21193,
173
+ "栱": 21581,
174
+ "梘": 21219,
175
+ "椏": 21385,
176
+ "椗": 21618,
177
+ "榘": 21560,
178
+ "榚": 21369,
179
+ "樋": 21601,
180
+ "樖": 21150,
181
+ "樨": 21475,
182
+ "樴": 21413,
183
+ "橛": 21156,
184
+ "檠": 21272,
185
+ "櫈": 21173,
186
+ "櫟": 21516,
187
+ "櫳": 21215,
188
+ "欏": 21500,
189
+ "殮": 21295,
190
+ "殻": 21207,
191
+ "氘": 21616,
192
+ "氚": 21574,
193
+ "氬": 21447,
194
+ "氼": 21329,
195
+ "沊": 21509,
196
+ "沔": 21552,
197
+ "沚": 21490,
198
+ "泂": 21461,
199
+ "涷": 21340,
200
+ "淥": 21235,
201
+ "淸": 21363,
202
+ "湉": 21443,
203
+ "湞": 21626,
204
+ "湴": 21407,
205
+ "滘": 21161,
206
+ "漖": 21627,
207
+ "潁": 21396,
208
+ "潯": 21241,
209
+ "澌": 21292,
210
+ "濰": 21394,
211
+ "濶": 21468,
212
+ "瀡": 21435,
213
+ "瀦": 21535,
214
+ "灃": 21625,
215
+ "灕": 21420,
216
+ "炆": 21172,
217
+ "炑": 21474,
218
+ "炘": 21621,
219
+ "烚": 21189,
220
+ "烴": 21229,
221
+ "焫": 21248,
222
+ "煇": 21227,
223
+ "煬": 21247,
224
+ "煱": 21347,
225
+ "燶": 21163,
226
+ "燾": 21386,
227
+ "牀": 21168,
228
+ "牘": 21600,
229
+ "猁": 21226,
230
+ "猢": 21609,
231
+ "猻": 21540,
232
+ "獌": 21198,
233
+ "獴": 21415,
234
+ "珓": 21370,
235
+ "琚": 21597,
236
+ "琤": 21393,
237
+ "琿": 21494,
238
+ "瑂": 21423,
239
+ "瑭": 21573,
240
+ "璘": 21555,
241
+ "璠": 21240,
242
+ "璣": 21299,
243
+ "璦": 21556,
244
+ "璩": 21508,
245
+ "瓘": 21554,
246
+ "瓚": 21318,
247
+ "甂": 21457,
248
+ "甑": 21539,
249
+ "甴": 21190,
250
+ "畧": 21322,
251
+ "畵": 21416,
252
+ "疎": 21460,
253
+ "疴": 21338,
254
+ "痲": 21576,
255
+ "痾": 21164,
256
+ "癆": 21503,
257
+ "癈": 21333,
258
+ "癗": 21433,
259
+ "癦": 21610,
260
+ "癩": 21410,
261
+ "睺": 21296,
262
+ "砬": 21568,
263
+ "砵": 21194,
264
+ "硃": 21570,
265
+ "硏": 21342,
266
+ "硤": 21201,
267
+ "礮": 21375,
268
+ "祆": 21472,
269
+ "祼": 21417,
270
+ "禕": 21542,
271
+ "禰": 21514,
272
+ "稈": 21367,
273
+ "穏": 21341,
274
+ "窰": 21230,
275
+ "竈": 21286,
276
+ "竉": 21289,
277
+ "竪": 21550,
278
+ "笪": 21147,
279
+ "筧": 21605,
280
+ "篋": 21359,
281
+ "簋": 21277,
282
+ "簒": 21418,
283
+ "粢": 21586,
284
+ "糉": 21263,
285
+ "糭": 21253,
286
+ "糴": 21425,
287
+ "紇": 21470,
288
+ "紥": 21252,
289
+ "綉": 21575,
290
+ "綟": 21260,
291
+ "綣": 21512,
292
+ "綷": 21441,
293
+ "緡": 21245,
294
+ "緲": 21517,
295
+ "縉": 21297,
296
+ "縹": 21587,
297
+ "繑": 21448,
298
+ "繙": 21246,
299
+ "缐": 21553,
300
+ "罅": 21191,
301
+ "罉": 21430,
302
+ "罟": 21324,
303
+ "羕": 21507,
304
+ "羶": 21378,
305
+ "翕": 21456,
306
+ "耖": 21390,
307
+ "肶": 21351,
308
+ "胐": 21332,
309
+ "脧": 21303,
310
+ "脷": 21148,
311
+ "腍": 21167,
312
+ "膥": 21228,
313
+ "膶": 21257,
314
+ "臏": 21566,
315
+ "舘": 21374,
316
+ "舢": 21563,
317
+ "舨": 21592,
318
+ "艶": 21593,
319
+ "苺": 21488,
320
+ "茘": 21624,
321
+ "菴": 21312,
322
+ "蒴": 21343,
323
+ "蓀": 21458,
324
+ "蔴": 21177,
325
+ "蕓": 21518,
326
+ "藪": 21302,
327
+ "蘄": 21613,
328
+ "蘅": 21478,
329
+ "蚧": 21579,
330
+ "蛺": 21391,
331
+ "蜑": 21358,
332
+ "蝻": 21259,
333
+ "螈": 21291,
334
+ "蟈": 21419,
335
+ "蟧": 21360,
336
+ "蟶": 21233,
337
+ "蠄": 21326,
338
+ "蠏": 21467,
339
+ "蠑": 21328,
340
+ "衊": 21469,
341
+ "裇": 21304,
342
+ "褦": 21221,
343
+ "褸": 21171,
344
+ "覈": 21290,
345
+ "覲": 21453,
346
+ "觜": 21496,
347
+ "訃": 21571,
348
+ "訌": 21412,
349
+ "訢": 21466,
350
+ "詏": 21244,
351
+ "詒": 21531,
352
+ "誒": 21152,
353
+ "謖": 21473,
354
+ "謚": 21237,
355
+ "謳": 21278,
356
+ "谿": 21580,
357
+ "豕": 21491,
358
+ "趷": 21424,
359
+ "跣": 21206,
360
+ "踎": 21202,
361
+ "踭": 21203,
362
+ "踼": 21404,
363
+ "躂": 21426,
364
+ "躄": 21195,
365
+ "躝": 21274,
366
+ "軚": 21196,
367
+ "軛": 21357,
368
+ "軫": 21349,
369
+ "軭": 21497,
370
+ "軻": 21434,
371
+ "輋": 21239,
372
+ "迾": 21325,
373
+ "逄": 21594,
374
+ "逑": 21611,
375
+ "逳": 21211,
376
+ "邴": 21310,
377
+ "郃": 21604,
378
+ "鄕": 21406,
379
+ "鄴": 21287,
380
+ "酎": 21546,
381
+ "釙": 21450,
382
+ "鈷": 21548,
383
+ "鈹": 21545,
384
+ "鉍": 21584,
385
+ "鉞": 21525,
386
+ "鉬": 21588,
387
+ "鉸": 21308,
388
+ "鉼": 21387,
389
+ "銥": 21462,
390
+ "銨": 21365,
391
+ "銫": 21440,
392
+ "銻": 21532,
393
+ "銼": 21432,
394
+ "鋇": 21614,
395
+ "鋯": 21590,
396
+ "錀": 21499,
397
+ "錒": 21498,
398
+ "錕": 21372,
399
+ "錡": 21199,
400
+ "鍔": 21515,
401
+ "鍚": 21273,
402
+ "鍠": 21261,
403
+ "鍬": 21379,
404
+ "鍶": 21344,
405
+ "鎅": 21339,
406
+ "鎘": 21529,
407
+ "鎢": 21362,
408
+ "鏇": 21397,
409
+ "鏐": 21585,
410
+ "鏝": 21218,
411
+ "鏵": 21444,
412
+ "鏹": 21502,
413
+ "鐡": 21505,
414
+ "鑌": 21452,
415
+ "鑭": 21567,
416
+ "閂": 21154,
417
+ "閆": 21463,
418
+ "閙": 21366,
419
+ "閤": 21381,
420
+ "閪": 21200,
421
+ "閭": 21409,
422
+ "闐": 21591,
423
+ "闓": 21442,
424
+ "靑": 21405,
425
+ "靭": 21262,
426
+ "靱": 21599,
427
+ "韃": 21242,
428
+ "韞": 21354,
429
+ "韮": 21254,
430
+ "頊": 21348,
431
+ "頴": 21327,
432
+ "顓": 21427,
433
+ "顥": 21411,
434
+ "顳": 21319,
435
+ "飮": 21561,
436
+ "餬": 21276,
437
+ "餸": 21133,
438
+ "饀": 21414,
439
+ "饉": 21534,
440
+ "馱": 21480,
441
+ "駖": 21356,
442
+ "駙": 21617,
443
+ "駟": 21489,
444
+ "駡": 21598,
445
+ "騫": 21314,
446
+ "騭": 21364,
447
+ "騮": 21174,
448
+ "騾": 21487,
449
+ "驃": 21336,
450
+ "驄": 21337,
451
+ "骹": 21355,
452
+ "髀": 21142,
453
+ "髹": 21350,
454
+ "鬅": 21213,
455
+ "鬈": 21408,
456
+ "鬩": 21543,
457
+ "鬭": 21495,
458
+ "鬲": 21320,
459
+ "魨": 21464,
460
+ "鮋": 21596,
461
+ "鮟": 21232,
462
+ "鮫": 21352,
463
+ "鯇": 21305,
464
+ "鯡": 21623,
465
+ "鯪": 21243,
466
+ "鯭": 21382,
467
+ "鰂": 21169,
468
+ "鰹": 21345,
469
+ "鱇": 21231,
470
+ "鱘": 21395,
471
+ "鱟": 21583,
472
+ "鱲": 21208,
473
+ "鳯": 21451,
474
+ "鴞": 21275,
475
+ "鴣": 21607,
476
+ "鴴": 21622,
477
+ "鵐": 21321,
478
+ "鵞": 21510,
479
+ "鵪": 21283,
480
+ "鶉": 21249,
481
+ "鶻": 21608,
482
+ "鶿": 21559,
483
+ "鷂": 21298,
484
+ "鷄": 21176,
485
+ "鷓": 21606,
486
+ "鷸": 21373,
487
+ "鸕": 21526,
488
+ "鸛": 21361,
489
+ "麪": 21149,
490
+ "麫": 21577,
491
+ "麿": 21225,
492
+ "黐": 21136,
493
+ "鼆": 21313,
494
+ "鼇": 21602,
495
+ "鼴": 21436,
496
+ "鼷": 21506,
497
+ "齲": 21519,
498
+ "齶": 21578,
499
+ "龑": 21523,
500
+ "龠": 21595,
501
+ "龢": 21513
502
+ }
bert/bert-large-cantonese/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 1024,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 4096,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 512,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 16,
16
+ "num_hidden_layers": 24,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "torch_dtype": "float32",
20
+ "transformers_version": "4.40.1",
21
+ "type_vocab_size": 2,
22
+ "use_cache": true,
23
+ "vocab_size": 21628
24
+ }
bert/bert-large-cantonese/generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.40.1"
5
+ }
bert/bert-large-cantonese/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
bert/bert-large-cantonese/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
bert/bert-large-cantonese/tokenizer_config.json ADDED
@@ -0,0 +1,4062 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "21128": {
44
+ "content": "噉",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "21129": {
52
+ "content": "嗌",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "21130": {
60
+ "content": "嗮",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "21131": {
68
+ "content": "嬲",
69
+ "lstrip": false,
70
+ "normalized": true,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "21132": {
76
+ "content": "揾",
77
+ "lstrip": false,
78
+ "normalized": true,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "21133": {
84
+ "content": "餸",
85
+ "lstrip": false,
86
+ "normalized": true,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "21134": {
92
+ "content": "嚿",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "21135": {
100
+ "content": "噚",
101
+ "lstrip": false,
102
+ "normalized": true,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "21136": {
108
+ "content": "黐",
109
+ "lstrip": false,
110
+ "normalized": true,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "21137": {
116
+ "content": "冧",
117
+ "lstrip": false,
118
+ "normalized": true,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "21138": {
124
+ "content": "冚",
125
+ "lstrip": false,
126
+ "normalized": true,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "21139": {
132
+ "content": "攰",
133
+ "lstrip": false,
134
+ "normalized": true,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "21140": {
140
+ "content": "囖",
141
+ "lstrip": false,
142
+ "normalized": true,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "21141": {
148
+ "content": "撳",
149
+ "lstrip": false,
150
+ "normalized": true,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": false
154
+ },
155
+ "21142": {
156
+ "content": "髀",
157
+ "lstrip": false,
158
+ "normalized": true,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "21143": {
164
+ "content": "乸",
165
+ "lstrip": false,
166
+ "normalized": true,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "21144": {
172
+ "content": "唥",
173
+ "lstrip": false,
174
+ "normalized": true,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "21145": {
180
+ "content": "嗱",
181
+ "lstrip": false,
182
+ "normalized": true,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "21146": {
188
+ "content": "唪",
189
+ "lstrip": false,
190
+ "normalized": true,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "21147": {
196
+ "content": "笪",
197
+ "lstrip": false,
198
+ "normalized": true,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "21148": {
204
+ "content": "脷",
205
+ "lstrip": false,
206
+ "normalized": true,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "21149": {
212
+ "content": "麪",
213
+ "lstrip": false,
214
+ "normalized": true,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "21150": {
220
+ "content": "樖",
221
+ "lstrip": false,
222
+ "normalized": true,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": false
226
+ },
227
+ "21151": {
228
+ "content": "嘥",
229
+ "lstrip": false,
230
+ "normalized": true,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": false
234
+ },
235
+ "21152": {
236
+ "content": "誒",
237
+ "lstrip": false,
238
+ "normalized": true,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": false
242
+ },
243
+ "21153": {
244
+ "content": "掟",
245
+ "lstrip": false,
246
+ "normalized": true,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": false
250
+ },
251
+ "21154": {
252
+ "content": "閂",
253
+ "lstrip": false,
254
+ "normalized": true,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": false
258
+ },
259
+ "21155": {
260
+ "content": "徂",
261
+ "lstrip": false,
262
+ "normalized": true,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": false
266
+ },
267
+ "21156": {
268
+ "content": "橛",
269
+ "lstrip": false,
270
+ "normalized": true,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": false
274
+ },
275
+ "21157": {
276
+ "content": "尐",
277
+ "lstrip": false,
278
+ "normalized": true,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": false
282
+ },
283
+ "21158": {
284
+ "content": "嚹",
285
+ "lstrip": false,
286
+ "normalized": true,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": false
290
+ },
291
+ "21159": {
292
+ "content": "唨",
293
+ "lstrip": false,
294
+ "normalized": true,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": false
298
+ },
299
+ "21160": {
300
+ "content": "屙",
301
+ "lstrip": false,
302
+ "normalized": true,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": false
306
+ },
307
+ "21161": {
308
+ "content": "滘",
309
+ "lstrip": false,
310
+ "normalized": true,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": false
314
+ },
315
+ "21162": {
316
+ "content": "戥",
317
+ "lstrip": false,
318
+ "normalized": true,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": false
322
+ },
323
+ "21163": {
324
+ "content": "燶",
325
+ "lstrip": false,
326
+ "normalized": true,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": false
330
+ },
331
+ "21164": {
332
+ "content": "痾",
333
+ "lstrip": false,
334
+ "normalized": true,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": false
338
+ },
339
+ "21165": {
340
+ "content": "喐",
341
+ "lstrip": false,
342
+ "normalized": true,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": false
346
+ },
347
+ "21166": {
348
+ "content": "掕",
349
+ "lstrip": false,
350
+ "normalized": true,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": false
354
+ },
355
+ "21167": {
356
+ "content": "腍",
357
+ "lstrip": false,
358
+ "normalized": true,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": false
362
+ },
363
+ "21168": {
364
+ "content": "牀",
365
+ "lstrip": false,
366
+ "normalized": true,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": false
370
+ },
371
+ "21169": {
372
+ "content": "鰂",
373
+ "lstrip": false,
374
+ "normalized": true,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": false
378
+ },
379
+ "21170": {
380
+ "content": "噏",
381
+ "lstrip": false,
382
+ "normalized": true,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": false
386
+ },
387
+ "21171": {
388
+ "content": "褸",
389
+ "lstrip": false,
390
+ "normalized": true,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": false
394
+ },
395
+ "21172": {
396
+ "content": "炆",
397
+ "lstrip": false,
398
+ "normalized": true,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": false
402
+ },
403
+ "21173": {
404
+ "content": "櫈",
405
+ "lstrip": false,
406
+ "normalized": true,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": false
410
+ },
411
+ "21174": {
412
+ "content": "騮",
413
+ "lstrip": false,
414
+ "normalized": true,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": false
418
+ },
419
+ "21175": {
420
+ "content": "唞",
421
+ "lstrip": false,
422
+ "normalized": true,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": false
426
+ },
427
+ "21176": {
428
+ "content": "鷄",
429
+ "lstrip": false,
430
+ "normalized": true,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": false
434
+ },
435
+ "21177": {
436
+ "content": "蔴",
437
+ "lstrip": false,
438
+ "normalized": true,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": false
442
+ },
443
+ "21178": {
444
+ "content": "啩",
445
+ "lstrip": false,
446
+ "normalized": true,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": false
450
+ },
451
+ "21179": {
452
+ "content": "孭",
453
+ "lstrip": false,
454
+ "normalized": true,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": false
458
+ },
459
+ "21180": {
460
+ "content": "埞",
461
+ "lstrip": false,
462
+ "normalized": true,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": false
466
+ },
467
+ "21181": {
468
+ "content": "戇",
469
+ "lstrip": false,
470
+ "normalized": true,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": false
474
+ },
475
+ "21182": {
476
+ "content": "㩒",
477
+ "lstrip": false,
478
+ "normalized": true,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": false
482
+ },
483
+ "21183": {
484
+ "content": "撾",
485
+ "lstrip": false,
486
+ "normalized": true,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": false
490
+ },
491
+ "21184": {
492
+ "content": "揼",
493
+ "lstrip": false,
494
+ "normalized": true,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": false
498
+ },
499
+ "21185": {
500
+ "content": "曱",
501
+ "lstrip": false,
502
+ "normalized": true,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": false
506
+ },
507
+ "21186": {
508
+ "content": "卽",
509
+ "lstrip": false,
510
+ "normalized": true,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": false
514
+ },
515
+ "21187": {
516
+ "content": "塱",
517
+ "lstrip": false,
518
+ "normalized": true,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": false
522
+ },
523
+ "21188": {
524
+ "content": "拃",
525
+ "lstrip": false,
526
+ "normalized": true,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": false
530
+ },
531
+ "21189": {
532
+ "content": "烚",
533
+ "lstrip": false,
534
+ "normalized": true,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": false
538
+ },
539
+ "21190": {
540
+ "content": "甴",
541
+ "lstrip": false,
542
+ "normalized": true,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": false
546
+ },
547
+ "21191": {
548
+ "content": "罅",
549
+ "lstrip": false,
550
+ "normalized": true,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": false
554
+ },
555
+ "21192": {
556
+ "content": "喼",
557
+ "lstrip": false,
558
+ "normalized": true,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": false
562
+ },
563
+ "21193": {
564
+ "content": "栢",
565
+ "lstrip": false,
566
+ "normalized": true,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": false
570
+ },
571
+ "21194": {
572
+ "content": "砵",
573
+ "lstrip": false,
574
+ "normalized": true,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": false
578
+ },
579
+ "21195": {
580
+ "content": "躄",
581
+ "lstrip": false,
582
+ "normalized": true,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": false
586
+ },
587
+ "21196": {
588
+ "content": "軚",
589
+ "lstrip": false,
590
+ "normalized": true,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": false
594
+ },
595
+ "21197": {
596
+ "content": "噃",
597
+ "lstrip": false,
598
+ "normalized": true,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": false
602
+ },
603
+ "21198": {
604
+ "content": "獌",
605
+ "lstrip": false,
606
+ "normalized": true,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": false
610
+ },
611
+ "21199": {
612
+ "content": "錡",
613
+ "lstrip": false,
614
+ "normalized": true,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": false
618
+ },
619
+ "21200": {
620
+ "content": "閪",
621
+ "lstrip": false,
622
+ "normalized": true,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": false
626
+ },
627
+ "21201": {
628
+ "content": "硤",
629
+ "lstrip": false,
630
+ "normalized": true,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": false
634
+ },
635
+ "21202": {
636
+ "content": "踎",
637
+ "lstrip": false,
638
+ "normalized": true,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": false
642
+ },
643
+ "21203": {
644
+ "content": "踭",
645
+ "lstrip": false,
646
+ "normalized": true,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": false
650
+ },
651
+ "21204": {
652
+ "content": "呔",
653
+ "lstrip": false,
654
+ "normalized": true,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": false
658
+ },
659
+ "21205": {
660
+ "content": "吔",
661
+ "lstrip": false,
662
+ "normalized": true,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": false
666
+ },
667
+ "21206": {
668
+ "content": "跣",
669
+ "lstrip": false,
670
+ "normalized": true,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": false
674
+ },
675
+ "21207": {
676
+ "content": "殻",
677
+ "lstrip": false,
678
+ "normalized": true,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": false
682
+ },
683
+ "21208": {
684
+ "content": "鱲",
685
+ "lstrip": false,
686
+ "normalized": true,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": false
690
+ },
691
+ "21209": {
692
+ "content": "摷",
693
+ "lstrip": false,
694
+ "normalized": true,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": false
698
+ },
699
+ "21210": {
700
+ "content": "孲",
701
+ "lstrip": false,
702
+ "normalized": true,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": false
706
+ },
707
+ "21211": {
708
+ "content": "逳",
709
+ "lstrip": false,
710
+ "normalized": true,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": false
714
+ },
715
+ "21212": {
716
+ "content": "捽",
717
+ "lstrip": false,
718
+ "normalized": true,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": false
722
+ },
723
+ "21213": {
724
+ "content": "鬅",
725
+ "lstrip": false,
726
+ "normalized": true,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": false
730
+ },
731
+ "21214": {
732
+ "content": "掹",
733
+ "lstrip": false,
734
+ "normalized": true,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": false
738
+ },
739
+ "21215": {
740
+ "content": "櫳",
741
+ "lstrip": false,
742
+ "normalized": true,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": false
746
+ },
747
+ "21216": {
748
+ "content": "㞗",
749
+ "lstrip": false,
750
+ "normalized": true,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": false
754
+ },
755
+ "21217": {
756
+ "content": "廸",
757
+ "lstrip": false,
758
+ "normalized": true,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": false
762
+ },
763
+ "21218": {
764
+ "content": "鏝",
765
+ "lstrip": false,
766
+ "normalized": true,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": false
770
+ },
771
+ "21219": {
772
+ "content": "梘",
773
+ "lstrip": false,
774
+ "normalized": true,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": false
778
+ },
779
+ "21220": {
780
+ "content": "偲",
781
+ "lstrip": false,
782
+ "normalized": true,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": false
786
+ },
787
+ "21221": {
788
+ "content": "褦",
789
+ "lstrip": false,
790
+ "normalized": true,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": false
794
+ },
795
+ "21222": {
796
+ "content": "搣",
797
+ "lstrip": false,
798
+ "normalized": true,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": false
802
+ },
803
+ "21223": {
804
+ "content": "唻",
805
+ "lstrip": false,
806
+ "normalized": true,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": false
810
+ },
811
+ "21224": {
812
+ "content": "撘",
813
+ "lstrip": false,
814
+ "normalized": true,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": false
818
+ },
819
+ "21225": {
820
+ "content": "麿",
821
+ "lstrip": false,
822
+ "normalized": true,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": false
826
+ },
827
+ "21226": {
828
+ "content": "猁",
829
+ "lstrip": false,
830
+ "normalized": true,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": false
834
+ },
835
+ "21227": {
836
+ "content": "煇",
837
+ "lstrip": false,
838
+ "normalized": true,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": false
842
+ },
843
+ "21228": {
844
+ "content": "膥",
845
+ "lstrip": false,
846
+ "normalized": true,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": false
850
+ },
851
+ "21229": {
852
+ "content": "烴",
853
+ "lstrip": false,
854
+ "normalized": true,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": false
858
+ },
859
+ "21230": {
860
+ "content": "窰",
861
+ "lstrip": false,
862
+ "normalized": true,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": false
866
+ },
867
+ "21231": {
868
+ "content": "鱇",
869
+ "lstrip": false,
870
+ "normalized": true,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": false
874
+ },
875
+ "21232": {
876
+ "content": "鮟",
877
+ "lstrip": false,
878
+ "normalized": true,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": false
882
+ },
883
+ "21233": {
884
+ "content": "蟶",
885
+ "lstrip": false,
886
+ "normalized": true,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": false
890
+ },
891
+ "21234": {
892
+ "content": "佮",
893
+ "lstrip": false,
894
+ "normalized": true,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": false
898
+ },
899
+ "21235": {
900
+ "content": "淥",
901
+ "lstrip": false,
902
+ "normalized": true,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": false
906
+ },
907
+ "21236": {
908
+ "content": "嚡",
909
+ "lstrip": false,
910
+ "normalized": true,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": false
914
+ },
915
+ "21237": {
916
+ "content": "謚",
917
+ "lstrip": false,
918
+ "normalized": true,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": false
922
+ },
923
+ "21238": {
924
+ "content": "抺",
925
+ "lstrip": false,
926
+ "normalized": true,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": false
930
+ },
931
+ "21239": {
932
+ "content": "輋",
933
+ "lstrip": false,
934
+ "normalized": true,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": false
938
+ },
939
+ "21240": {
940
+ "content": "璠",
941
+ "lstrip": false,
942
+ "normalized": true,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": false
946
+ },
947
+ "21241": {
948
+ "content": "潯",
949
+ "lstrip": false,
950
+ "normalized": true,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": false
954
+ },
955
+ "21242": {
956
+ "content": "韃",
957
+ "lstrip": false,
958
+ "normalized": true,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": false
962
+ },
963
+ "21243": {
964
+ "content": "鯪",
965
+ "lstrip": false,
966
+ "normalized": true,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": false
970
+ },
971
+ "21244": {
972
+ "content": "詏",
973
+ "lstrip": false,
974
+ "normalized": true,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": false
978
+ },
979
+ "21245": {
980
+ "content": "緡",
981
+ "lstrip": false,
982
+ "normalized": true,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": false
986
+ },
987
+ "21246": {
988
+ "content": "繙",
989
+ "lstrip": false,
990
+ "normalized": true,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": false
994
+ },
995
+ "21247": {
996
+ "content": "煬",
997
+ "lstrip": false,
998
+ "normalized": true,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": false
1002
+ },
1003
+ "21248": {
1004
+ "content": "焫",
1005
+ "lstrip": false,
1006
+ "normalized": true,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": false
1010
+ },
1011
+ "21249": {
1012
+ "content": "鶉",
1013
+ "lstrip": false,
1014
+ "normalized": true,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": false
1018
+ },
1019
+ "21250": {
1020
+ "content": "唒",
1021
+ "lstrip": false,
1022
+ "normalized": true,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": false
1026
+ },
1027
+ "21251": {
1028
+ "content": "揈",
1029
+ "lstrip": false,
1030
+ "normalized": true,
1031
+ "rstrip": false,
1032
+ "single_word": false,
1033
+ "special": false
1034
+ },
1035
+ "21252": {
1036
+ "content": "紥",
1037
+ "lstrip": false,
1038
+ "normalized": true,
1039
+ "rstrip": false,
1040
+ "single_word": false,
1041
+ "special": false
1042
+ },
1043
+ "21253": {
1044
+ "content": "糭",
1045
+ "lstrip": false,
1046
+ "normalized": true,
1047
+ "rstrip": false,
1048
+ "single_word": false,
1049
+ "special": false
1050
+ },
1051
+ "21254": {
1052
+ "content": "韮",
1053
+ "lstrip": false,
1054
+ "normalized": true,
1055
+ "rstrip": false,
1056
+ "single_word": false,
1057
+ "special": false
1058
+ },
1059
+ "21255": {
1060
+ "content": "㚻",
1061
+ "lstrip": false,
1062
+ "normalized": true,
1063
+ "rstrip": false,
1064
+ "single_word": false,
1065
+ "special": false
1066
+ },
1067
+ "21256": {
1068
+ "content": "厠",
1069
+ "lstrip": false,
1070
+ "normalized": true,
1071
+ "rstrip": false,
1072
+ "single_word": false,
1073
+ "special": false
1074
+ },
1075
+ "21257": {
1076
+ "content": "膶",
1077
+ "lstrip": false,
1078
+ "normalized": true,
1079
+ "rstrip": false,
1080
+ "single_word": false,
1081
+ "special": false
1082
+ },
1083
+ "21258": {
1084
+ "content": "抌",
1085
+ "lstrip": false,
1086
+ "normalized": true,
1087
+ "rstrip": false,
1088
+ "single_word": false,
1089
+ "special": false
1090
+ },
1091
+ "21259": {
1092
+ "content": "蝻",
1093
+ "lstrip": false,
1094
+ "normalized": true,
1095
+ "rstrip": false,
1096
+ "single_word": false,
1097
+ "special": false
1098
+ },
1099
+ "21260": {
1100
+ "content": "綟",
1101
+ "lstrip": false,
1102
+ "normalized": true,
1103
+ "rstrip": false,
1104
+ "single_word": false,
1105
+ "special": false
1106
+ },
1107
+ "21261": {
1108
+ "content": "鍠",
1109
+ "lstrip": false,
1110
+ "normalized": true,
1111
+ "rstrip": false,
1112
+ "single_word": false,
1113
+ "special": false
1114
+ },
1115
+ "21262": {
1116
+ "content": "靭",
1117
+ "lstrip": false,
1118
+ "normalized": true,
1119
+ "rstrip": false,
1120
+ "single_word": false,
1121
+ "special": false
1122
+ },
1123
+ "21263": {
1124
+ "content": "糉",
1125
+ "lstrip": false,
1126
+ "normalized": true,
1127
+ "rstrip": false,
1128
+ "single_word": false,
1129
+ "special": false
1130
+ },
1131
+ "21264": {
1132
+ "content": "孻",
1133
+ "lstrip": false,
1134
+ "normalized": true,
1135
+ "rstrip": false,
1136
+ "single_word": false,
1137
+ "special": false
1138
+ },
1139
+ "21265": {
1140
+ "content": "㷫",
1141
+ "lstrip": false,
1142
+ "normalized": true,
1143
+ "rstrip": false,
1144
+ "single_word": false,
1145
+ "special": false
1146
+ },
1147
+ "21266": {
1148
+ "content": "抆",
1149
+ "lstrip": false,
1150
+ "normalized": true,
1151
+ "rstrip": false,
1152
+ "single_word": false,
1153
+ "special": false
1154
+ },
1155
+ "21267": {
1156
+ "content": "嶠",
1157
+ "lstrip": false,
1158
+ "normalized": true,
1159
+ "rstrip": false,
1160
+ "single_word": false,
1161
+ "special": false
1162
+ },
1163
+ "21268": {
1164
+ "content": "摑",
1165
+ "lstrip": false,
1166
+ "normalized": true,
1167
+ "rstrip": false,
1168
+ "single_word": false,
1169
+ "special": false
1170
+ },
1171
+ "21269": {
1172
+ "content": "幗",
1173
+ "lstrip": false,
1174
+ "normalized": true,
1175
+ "rstrip": false,
1176
+ "single_word": false,
1177
+ "special": false
1178
+ },
1179
+ "21270": {
1180
+ "content": "攷",
1181
+ "lstrip": false,
1182
+ "normalized": true,
1183
+ "rstrip": false,
1184
+ "single_word": false,
1185
+ "special": false
1186
+ },
1187
+ "21271": {
1188
+ "content": "拏",
1189
+ "lstrip": false,
1190
+ "normalized": true,
1191
+ "rstrip": false,
1192
+ "single_word": false,
1193
+ "special": false
1194
+ },
1195
+ "21272": {
1196
+ "content": "檠",
1197
+ "lstrip": false,
1198
+ "normalized": true,
1199
+ "rstrip": false,
1200
+ "single_word": false,
1201
+ "special": false
1202
+ },
1203
+ "21273": {
1204
+ "content": "鍚",
1205
+ "lstrip": false,
1206
+ "normalized": true,
1207
+ "rstrip": false,
1208
+ "single_word": false,
1209
+ "special": false
1210
+ },
1211
+ "21274": {
1212
+ "content": "躝",
1213
+ "lstrip": false,
1214
+ "normalized": true,
1215
+ "rstrip": false,
1216
+ "single_word": false,
1217
+ "special": false
1218
+ },
1219
+ "21275": {
1220
+ "content": "鴞",
1221
+ "lstrip": false,
1222
+ "normalized": true,
1223
+ "rstrip": false,
1224
+ "single_word": false,
1225
+ "special": false
1226
+ },
1227
+ "21276": {
1228
+ "content": "餬",
1229
+ "lstrip": false,
1230
+ "normalized": true,
1231
+ "rstrip": false,
1232
+ "single_word": false,
1233
+ "special": false
1234
+ },
1235
+ "21277": {
1236
+ "content": "簋",
1237
+ "lstrip": false,
1238
+ "normalized": true,
1239
+ "rstrip": false,
1240
+ "single_word": false,
1241
+ "special": false
1242
+ },
1243
+ "21278": {
1244
+ "content": "謳",
1245
+ "lstrip": false,
1246
+ "normalized": true,
1247
+ "rstrip": false,
1248
+ "single_word": false,
1249
+ "special": false
1250
+ },
1251
+ "21279": {
1252
+ "content": "幪",
1253
+ "lstrip": false,
1254
+ "normalized": true,
1255
+ "rstrip": false,
1256
+ "single_word": false,
1257
+ "special": false
1258
+ },
1259
+ "21280": {
1260
+ "content": "旯",
1261
+ "lstrip": false,
1262
+ "normalized": true,
1263
+ "rstrip": false,
1264
+ "single_word": false,
1265
+ "special": false
1266
+ },
1267
+ "21281": {
1268
+ "content": "戙",
1269
+ "lstrip": false,
1270
+ "normalized": true,
1271
+ "rstrip": false,
1272
+ "single_word": false,
1273
+ "special": false
1274
+ },
1275
+ "21282": {
1276
+ "content": "嚙",
1277
+ "lstrip": false,
1278
+ "normalized": true,
1279
+ "rstrip": false,
1280
+ "single_word": false,
1281
+ "special": false
1282
+ },
1283
+ "21283": {
1284
+ "content": "鵪",
1285
+ "lstrip": false,
1286
+ "normalized": true,
1287
+ "rstrip": false,
1288
+ "single_word": false,
1289
+ "special": false
1290
+ },
1291
+ "21284": {
1292
+ "content": "卌",
1293
+ "lstrip": false,
1294
+ "normalized": true,
1295
+ "rstrip": false,
1296
+ "single_word": false,
1297
+ "special": false
1298
+ },
1299
+ "21285": {
1300
+ "content": "厹",
1301
+ "lstrip": false,
1302
+ "normalized": true,
1303
+ "rstrip": false,
1304
+ "single_word": false,
1305
+ "special": false
1306
+ },
1307
+ "21286": {
1308
+ "content": "竈",
1309
+ "lstrip": false,
1310
+ "normalized": true,
1311
+ "rstrip": false,
1312
+ "single_word": false,
1313
+ "special": false
1314
+ },
1315
+ "21287": {
1316
+ "content": "鄴",
1317
+ "lstrip": false,
1318
+ "normalized": true,
1319
+ "rstrip": false,
1320
+ "single_word": false,
1321
+ "special": false
1322
+ },
1323
+ "21288": {
1324
+ "content": "埲",
1325
+ "lstrip": false,
1326
+ "normalized": true,
1327
+ "rstrip": false,
1328
+ "single_word": false,
1329
+ "special": false
1330
+ },
1331
+ "21289": {
1332
+ "content": "竉",
1333
+ "lstrip": false,
1334
+ "normalized": true,
1335
+ "rstrip": false,
1336
+ "single_word": false,
1337
+ "special": false
1338
+ },
1339
+ "21290": {
1340
+ "content": "覈",
1341
+ "lstrip": false,
1342
+ "normalized": true,
1343
+ "rstrip": false,
1344
+ "single_word": false,
1345
+ "special": false
1346
+ },
1347
+ "21291": {
1348
+ "content": "螈",
1349
+ "lstrip": false,
1350
+ "normalized": true,
1351
+ "rstrip": false,
1352
+ "single_word": false,
1353
+ "special": false
1354
+ },
1355
+ "21292": {
1356
+ "content": "澌",
1357
+ "lstrip": false,
1358
+ "normalized": true,
1359
+ "rstrip": false,
1360
+ "single_word": false,
1361
+ "special": false
1362
+ },
1363
+ "21293": {
1364
+ "content": "扺",
1365
+ "lstrip": false,
1366
+ "normalized": true,
1367
+ "rstrip": false,
1368
+ "single_word": false,
1369
+ "special": false
1370
+ },
1371
+ "21294": {
1372
+ "content": "埐",
1373
+ "lstrip": false,
1374
+ "normalized": true,
1375
+ "rstrip": false,
1376
+ "single_word": false,
1377
+ "special": false
1378
+ },
1379
+ "21295": {
1380
+ "content": "殮",
1381
+ "lstrip": false,
1382
+ "normalized": true,
1383
+ "rstrip": false,
1384
+ "single_word": false,
1385
+ "special": false
1386
+ },
1387
+ "21296": {
1388
+ "content": "睺",
1389
+ "lstrip": false,
1390
+ "normalized": true,
1391
+ "rstrip": false,
1392
+ "single_word": false,
1393
+ "special": false
1394
+ },
1395
+ "21297": {
1396
+ "content": "縉",
1397
+ "lstrip": false,
1398
+ "normalized": true,
1399
+ "rstrip": false,
1400
+ "single_word": false,
1401
+ "special": false
1402
+ },
1403
+ "21298": {
1404
+ "content": "鷂",
1405
+ "lstrip": false,
1406
+ "normalized": true,
1407
+ "rstrip": false,
1408
+ "single_word": false,
1409
+ "special": false
1410
+ },
1411
+ "21299": {
1412
+ "content": "璣",
1413
+ "lstrip": false,
1414
+ "normalized": true,
1415
+ "rstrip": false,
1416
+ "single_word": false,
1417
+ "special": false
1418
+ },
1419
+ "21300": {
1420
+ "content": "咇",
1421
+ "lstrip": false,
1422
+ "normalized": true,
1423
+ "rstrip": false,
1424
+ "single_word": false,
1425
+ "special": false
1426
+ },
1427
+ "21301": {
1428
+ "content": "忟",
1429
+ "lstrip": false,
1430
+ "normalized": true,
1431
+ "rstrip": false,
1432
+ "single_word": false,
1433
+ "special": false
1434
+ },
1435
+ "21302": {
1436
+ "content": "藪",
1437
+ "lstrip": false,
1438
+ "normalized": true,
1439
+ "rstrip": false,
1440
+ "single_word": false,
1441
+ "special": false
1442
+ },
1443
+ "21303": {
1444
+ "content": "脧",
1445
+ "lstrip": false,
1446
+ "normalized": true,
1447
+ "rstrip": false,
1448
+ "single_word": false,
1449
+ "special": false
1450
+ },
1451
+ "21304": {
1452
+ "content": "裇",
1453
+ "lstrip": false,
1454
+ "normalized": true,
1455
+ "rstrip": false,
1456
+ "single_word": false,
1457
+ "special": false
1458
+ },
1459
+ "21305": {
1460
+ "content": "鯇",
1461
+ "lstrip": false,
1462
+ "normalized": true,
1463
+ "rstrip": false,
1464
+ "single_word": false,
1465
+ "special": false
1466
+ },
1467
+ "21306": {
1468
+ "content": "奀",
1469
+ "lstrip": false,
1470
+ "normalized": true,
1471
+ "rstrip": false,
1472
+ "single_word": false,
1473
+ "special": false
1474
+ },
1475
+ "21307": {
1476
+ "content": "㨂",
1477
+ "lstrip": false,
1478
+ "normalized": true,
1479
+ "rstrip": false,
1480
+ "single_word": false,
1481
+ "special": false
1482
+ },
1483
+ "21308": {
1484
+ "content": "鉸",
1485
+ "lstrip": false,
1486
+ "normalized": true,
1487
+ "rstrip": false,
1488
+ "single_word": false,
1489
+ "special": false
1490
+ },
1491
+ "21309": {
1492
+ "content": "堊",
1493
+ "lstrip": false,
1494
+ "normalized": true,
1495
+ "rstrip": false,
1496
+ "single_word": false,
1497
+ "special": false
1498
+ },
1499
+ "21310": {
1500
+ "content": "邴",
1501
+ "lstrip": false,
1502
+ "normalized": true,
1503
+ "rstrip": false,
1504
+ "single_word": false,
1505
+ "special": false
1506
+ },
1507
+ "21311": {
1508
+ "content": "嫗",
1509
+ "lstrip": false,
1510
+ "normalized": true,
1511
+ "rstrip": false,
1512
+ "single_word": false,
1513
+ "special": false
1514
+ },
1515
+ "21312": {
1516
+ "content": "菴",
1517
+ "lstrip": false,
1518
+ "normalized": true,
1519
+ "rstrip": false,
1520
+ "single_word": false,
1521
+ "special": false
1522
+ },
1523
+ "21313": {
1524
+ "content": "鼆",
1525
+ "lstrip": false,
1526
+ "normalized": true,
1527
+ "rstrip": false,
1528
+ "single_word": false,
1529
+ "special": false
1530
+ },
1531
+ "21314": {
1532
+ "content": "騫",
1533
+ "lstrip": false,
1534
+ "normalized": true,
1535
+ "rstrip": false,
1536
+ "single_word": false,
1537
+ "special": false
1538
+ },
1539
+ "21315": {
1540
+ "content": "坭",
1541
+ "lstrip": false,
1542
+ "normalized": true,
1543
+ "rstrip": false,
1544
+ "single_word": false,
1545
+ "special": false
1546
+ },
1547
+ "21316": {
1548
+ "content": "喥",
1549
+ "lstrip": false,
1550
+ "normalized": true,
1551
+ "rstrip": false,
1552
+ "single_word": false,
1553
+ "special": false
1554
+ },
1555
+ "21317": {
1556
+ "content": "搲",
1557
+ "lstrip": false,
1558
+ "normalized": true,
1559
+ "rstrip": false,
1560
+ "single_word": false,
1561
+ "special": false
1562
+ },
1563
+ "21318": {
1564
+ "content": "瓚",
1565
+ "lstrip": false,
1566
+ "normalized": true,
1567
+ "rstrip": false,
1568
+ "single_word": false,
1569
+ "special": false
1570
+ },
1571
+ "21319": {
1572
+ "content": "顳",
1573
+ "lstrip": false,
1574
+ "normalized": true,
1575
+ "rstrip": false,
1576
+ "single_word": false,
1577
+ "special": false
1578
+ },
1579
+ "21320": {
1580
+ "content": "鬲",
1581
+ "lstrip": false,
1582
+ "normalized": true,
1583
+ "rstrip": false,
1584
+ "single_word": false,
1585
+ "special": false
1586
+ },
1587
+ "21321": {
1588
+ "content": "鵐",
1589
+ "lstrip": false,
1590
+ "normalized": true,
1591
+ "rstrip": false,
1592
+ "single_word": false,
1593
+ "special": false
1594
+ },
1595
+ "21322": {
1596
+ "content": "畧",
1597
+ "lstrip": false,
1598
+ "normalized": true,
1599
+ "rstrip": false,
1600
+ "single_word": false,
1601
+ "special": false
1602
+ },
1603
+ "21323": {
1604
+ "content": "嫰",
1605
+ "lstrip": false,
1606
+ "normalized": true,
1607
+ "rstrip": false,
1608
+ "single_word": false,
1609
+ "special": false
1610
+ },
1611
+ "21324": {
1612
+ "content": "罟",
1613
+ "lstrip": false,
1614
+ "normalized": true,
1615
+ "rstrip": false,
1616
+ "single_word": false,
1617
+ "special": false
1618
+ },
1619
+ "21325": {
1620
+ "content": "���",
1621
+ "lstrip": false,
1622
+ "normalized": true,
1623
+ "rstrip": false,
1624
+ "single_word": false,
1625
+ "special": false
1626
+ },
1627
+ "21326": {
1628
+ "content": "蠄",
1629
+ "lstrip": false,
1630
+ "normalized": true,
1631
+ "rstrip": false,
1632
+ "single_word": false,
1633
+ "special": false
1634
+ },
1635
+ "21327": {
1636
+ "content": "頴",
1637
+ "lstrip": false,
1638
+ "normalized": true,
1639
+ "rstrip": false,
1640
+ "single_word": false,
1641
+ "special": false
1642
+ },
1643
+ "21328": {
1644
+ "content": "蠑",
1645
+ "lstrip": false,
1646
+ "normalized": true,
1647
+ "rstrip": false,
1648
+ "single_word": false,
1649
+ "special": false
1650
+ },
1651
+ "21329": {
1652
+ "content": "氼",
1653
+ "lstrip": false,
1654
+ "normalized": true,
1655
+ "rstrip": false,
1656
+ "single_word": false,
1657
+ "special": false
1658
+ },
1659
+ "21330": {
1660
+ "content": "㖭",
1661
+ "lstrip": false,
1662
+ "normalized": true,
1663
+ "rstrip": false,
1664
+ "single_word": false,
1665
+ "special": false
1666
+ },
1667
+ "21331": {
1668
+ "content": "呑",
1669
+ "lstrip": false,
1670
+ "normalized": true,
1671
+ "rstrip": false,
1672
+ "single_word": false,
1673
+ "special": false
1674
+ },
1675
+ "21332": {
1676
+ "content": "胐",
1677
+ "lstrip": false,
1678
+ "normalized": true,
1679
+ "rstrip": false,
1680
+ "single_word": false,
1681
+ "special": false
1682
+ },
1683
+ "21333": {
1684
+ "content": "癈",
1685
+ "lstrip": false,
1686
+ "normalized": true,
1687
+ "rstrip": false,
1688
+ "single_word": false,
1689
+ "special": false
1690
+ },
1691
+ "21334": {
1692
+ "content": "擸",
1693
+ "lstrip": false,
1694
+ "normalized": true,
1695
+ "rstrip": false,
1696
+ "single_word": false,
1697
+ "special": false
1698
+ },
1699
+ "21335": {
1700
+ "content": "彖",
1701
+ "lstrip": false,
1702
+ "normalized": true,
1703
+ "rstrip": false,
1704
+ "single_word": false,
1705
+ "special": false
1706
+ },
1707
+ "21336": {
1708
+ "content": "驃",
1709
+ "lstrip": false,
1710
+ "normalized": true,
1711
+ "rstrip": false,
1712
+ "single_word": false,
1713
+ "special": false
1714
+ },
1715
+ "21337": {
1716
+ "content": "驄",
1717
+ "lstrip": false,
1718
+ "normalized": true,
1719
+ "rstrip": false,
1720
+ "single_word": false,
1721
+ "special": false
1722
+ },
1723
+ "21338": {
1724
+ "content": "疴",
1725
+ "lstrip": false,
1726
+ "normalized": true,
1727
+ "rstrip": false,
1728
+ "single_word": false,
1729
+ "special": false
1730
+ },
1731
+ "21339": {
1732
+ "content": "鎅",
1733
+ "lstrip": false,
1734
+ "normalized": true,
1735
+ "rstrip": false,
1736
+ "single_word": false,
1737
+ "special": false
1738
+ },
1739
+ "21340": {
1740
+ "content": "涷",
1741
+ "lstrip": false,
1742
+ "normalized": true,
1743
+ "rstrip": false,
1744
+ "single_word": false,
1745
+ "special": false
1746
+ },
1747
+ "21341": {
1748
+ "content": "穏",
1749
+ "lstrip": false,
1750
+ "normalized": true,
1751
+ "rstrip": false,
1752
+ "single_word": false,
1753
+ "special": false
1754
+ },
1755
+ "21342": {
1756
+ "content": "硏",
1757
+ "lstrip": false,
1758
+ "normalized": true,
1759
+ "rstrip": false,
1760
+ "single_word": false,
1761
+ "special": false
1762
+ },
1763
+ "21343": {
1764
+ "content": "蒴",
1765
+ "lstrip": false,
1766
+ "normalized": true,
1767
+ "rstrip": false,
1768
+ "single_word": false,
1769
+ "special": false
1770
+ },
1771
+ "21344": {
1772
+ "content": "鍶",
1773
+ "lstrip": false,
1774
+ "normalized": true,
1775
+ "rstrip": false,
1776
+ "single_word": false,
1777
+ "special": false
1778
+ },
1779
+ "21345": {
1780
+ "content": "鰹",
1781
+ "lstrip": false,
1782
+ "normalized": true,
1783
+ "rstrip": false,
1784
+ "single_word": false,
1785
+ "special": false
1786
+ },
1787
+ "21346": {
1788
+ "content": "嚫",
1789
+ "lstrip": false,
1790
+ "normalized": true,
1791
+ "rstrip": false,
1792
+ "single_word": false,
1793
+ "special": false
1794
+ },
1795
+ "21347": {
1796
+ "content": "煱",
1797
+ "lstrip": false,
1798
+ "normalized": true,
1799
+ "rstrip": false,
1800
+ "single_word": false,
1801
+ "special": false
1802
+ },
1803
+ "21348": {
1804
+ "content": "頊",
1805
+ "lstrip": false,
1806
+ "normalized": true,
1807
+ "rstrip": false,
1808
+ "single_word": false,
1809
+ "special": false
1810
+ },
1811
+ "21349": {
1812
+ "content": "軫",
1813
+ "lstrip": false,
1814
+ "normalized": true,
1815
+ "rstrip": false,
1816
+ "single_word": false,
1817
+ "special": false
1818
+ },
1819
+ "21350": {
1820
+ "content": "髹",
1821
+ "lstrip": false,
1822
+ "normalized": true,
1823
+ "rstrip": false,
1824
+ "single_word": false,
1825
+ "special": false
1826
+ },
1827
+ "21351": {
1828
+ "content": "肶",
1829
+ "lstrip": false,
1830
+ "normalized": true,
1831
+ "rstrip": false,
1832
+ "single_word": false,
1833
+ "special": false
1834
+ },
1835
+ "21352": {
1836
+ "content": "鮫",
1837
+ "lstrip": false,
1838
+ "normalized": true,
1839
+ "rstrip": false,
1840
+ "single_word": false,
1841
+ "special": false
1842
+ },
1843
+ "21353": {
1844
+ "content": "惗",
1845
+ "lstrip": false,
1846
+ "normalized": true,
1847
+ "rstrip": false,
1848
+ "single_word": false,
1849
+ "special": false
1850
+ },
1851
+ "21354": {
1852
+ "content": "韞",
1853
+ "lstrip": false,
1854
+ "normalized": true,
1855
+ "rstrip": false,
1856
+ "single_word": false,
1857
+ "special": false
1858
+ },
1859
+ "21355": {
1860
+ "content": "骹",
1861
+ "lstrip": false,
1862
+ "normalized": true,
1863
+ "rstrip": false,
1864
+ "single_word": false,
1865
+ "special": false
1866
+ },
1867
+ "21356": {
1868
+ "content": "駖",
1869
+ "lstrip": false,
1870
+ "normalized": true,
1871
+ "rstrip": false,
1872
+ "single_word": false,
1873
+ "special": false
1874
+ },
1875
+ "21357": {
1876
+ "content": "軛",
1877
+ "lstrip": false,
1878
+ "normalized": true,
1879
+ "rstrip": false,
1880
+ "single_word": false,
1881
+ "special": false
1882
+ },
1883
+ "21358": {
1884
+ "content": "蜑",
1885
+ "lstrip": false,
1886
+ "normalized": true,
1887
+ "rstrip": false,
1888
+ "single_word": false,
1889
+ "special": false
1890
+ },
1891
+ "21359": {
1892
+ "content": "篋",
1893
+ "lstrip": false,
1894
+ "normalized": true,
1895
+ "rstrip": false,
1896
+ "single_word": false,
1897
+ "special": false
1898
+ },
1899
+ "21360": {
1900
+ "content": "蟧",
1901
+ "lstrip": false,
1902
+ "normalized": true,
1903
+ "rstrip": false,
1904
+ "single_word": false,
1905
+ "special": false
1906
+ },
1907
+ "21361": {
1908
+ "content": "鸛",
1909
+ "lstrip": false,
1910
+ "normalized": true,
1911
+ "rstrip": false,
1912
+ "single_word": false,
1913
+ "special": false
1914
+ },
1915
+ "21362": {
1916
+ "content": "鎢",
1917
+ "lstrip": false,
1918
+ "normalized": true,
1919
+ "rstrip": false,
1920
+ "single_word": false,
1921
+ "special": false
1922
+ },
1923
+ "21363": {
1924
+ "content": "淸",
1925
+ "lstrip": false,
1926
+ "normalized": true,
1927
+ "rstrip": false,
1928
+ "single_word": false,
1929
+ "special": false
1930
+ },
1931
+ "21364": {
1932
+ "content": "騭",
1933
+ "lstrip": false,
1934
+ "normalized": true,
1935
+ "rstrip": false,
1936
+ "single_word": false,
1937
+ "special": false
1938
+ },
1939
+ "21365": {
1940
+ "content": "銨",
1941
+ "lstrip": false,
1942
+ "normalized": true,
1943
+ "rstrip": false,
1944
+ "single_word": false,
1945
+ "special": false
1946
+ },
1947
+ "21366": {
1948
+ "content": "閙",
1949
+ "lstrip": false,
1950
+ "normalized": true,
1951
+ "rstrip": false,
1952
+ "single_word": false,
1953
+ "special": false
1954
+ },
1955
+ "21367": {
1956
+ "content": "稈",
1957
+ "lstrip": false,
1958
+ "normalized": true,
1959
+ "rstrip": false,
1960
+ "single_word": false,
1961
+ "special": false
1962
+ },
1963
+ "21368": {
1964
+ "content": "兗",
1965
+ "lstrip": false,
1966
+ "normalized": true,
1967
+ "rstrip": false,
1968
+ "single_word": false,
1969
+ "special": false
1970
+ },
1971
+ "21369": {
1972
+ "content": "榚",
1973
+ "lstrip": false,
1974
+ "normalized": true,
1975
+ "rstrip": false,
1976
+ "single_word": false,
1977
+ "special": false
1978
+ },
1979
+ "21370": {
1980
+ "content": "珓",
1981
+ "lstrip": false,
1982
+ "normalized": true,
1983
+ "rstrip": false,
1984
+ "single_word": false,
1985
+ "special": false
1986
+ },
1987
+ "21371": {
1988
+ "content": "揦",
1989
+ "lstrip": false,
1990
+ "normalized": true,
1991
+ "rstrip": false,
1992
+ "single_word": false,
1993
+ "special": false
1994
+ },
1995
+ "21372": {
1996
+ "content": "錕",
1997
+ "lstrip": false,
1998
+ "normalized": true,
1999
+ "rstrip": false,
2000
+ "single_word": false,
2001
+ "special": false
2002
+ },
2003
+ "21373": {
2004
+ "content": "鷸",
2005
+ "lstrip": false,
2006
+ "normalized": true,
2007
+ "rstrip": false,
2008
+ "single_word": false,
2009
+ "special": false
2010
+ },
2011
+ "21374": {
2012
+ "content": "舘",
2013
+ "lstrip": false,
2014
+ "normalized": true,
2015
+ "rstrip": false,
2016
+ "single_word": false,
2017
+ "special": false
2018
+ },
2019
+ "21375": {
2020
+ "content": "礮",
2021
+ "lstrip": false,
2022
+ "normalized": true,
2023
+ "rstrip": false,
2024
+ "single_word": false,
2025
+ "special": false
2026
+ },
2027
+ "21376": {
2028
+ "content": "哚",
2029
+ "lstrip": false,
2030
+ "normalized": true,
2031
+ "rstrip": false,
2032
+ "single_word": false,
2033
+ "special": false
2034
+ },
2035
+ "21377": {
2036
+ "content": "昪",
2037
+ "lstrip": false,
2038
+ "normalized": true,
2039
+ "rstrip": false,
2040
+ "single_word": false,
2041
+ "special": false
2042
+ },
2043
+ "21378": {
2044
+ "content": "羶",
2045
+ "lstrip": false,
2046
+ "normalized": true,
2047
+ "rstrip": false,
2048
+ "single_word": false,
2049
+ "special": false
2050
+ },
2051
+ "21379": {
2052
+ "content": "鍬",
2053
+ "lstrip": false,
2054
+ "normalized": true,
2055
+ "rstrip": false,
2056
+ "single_word": false,
2057
+ "special": false
2058
+ },
2059
+ "21380": {
2060
+ "content": "昺",
2061
+ "lstrip": false,
2062
+ "normalized": true,
2063
+ "rstrip": false,
2064
+ "single_word": false,
2065
+ "special": false
2066
+ },
2067
+ "21381": {
2068
+ "content": "閤",
2069
+ "lstrip": false,
2070
+ "normalized": true,
2071
+ "rstrip": false,
2072
+ "single_word": false,
2073
+ "special": false
2074
+ },
2075
+ "21382": {
2076
+ "content": "鯭",
2077
+ "lstrip": false,
2078
+ "normalized": true,
2079
+ "rstrip": false,
2080
+ "single_word": false,
2081
+ "special": false
2082
+ },
2083
+ "21383": {
2084
+ "content": "搦",
2085
+ "lstrip": false,
2086
+ "normalized": true,
2087
+ "rstrip": false,
2088
+ "single_word": false,
2089
+ "special": false
2090
+ },
2091
+ "21384": {
2092
+ "content": "㞘",
2093
+ "lstrip": false,
2094
+ "normalized": true,
2095
+ "rstrip": false,
2096
+ "single_word": false,
2097
+ "special": false
2098
+ },
2099
+ "21385": {
2100
+ "content": "椏",
2101
+ "lstrip": false,
2102
+ "normalized": true,
2103
+ "rstrip": false,
2104
+ "single_word": false,
2105
+ "special": false
2106
+ },
2107
+ "21386": {
2108
+ "content": "燾",
2109
+ "lstrip": false,
2110
+ "normalized": true,
2111
+ "rstrip": false,
2112
+ "single_word": false,
2113
+ "special": false
2114
+ },
2115
+ "21387": {
2116
+ "content": "鉼",
2117
+ "lstrip": false,
2118
+ "normalized": true,
2119
+ "rstrip": false,
2120
+ "single_word": false,
2121
+ "special": false
2122
+ },
2123
+ "21388": {
2124
+ "content": "儍",
2125
+ "lstrip": false,
2126
+ "normalized": true,
2127
+ "rstrip": false,
2128
+ "single_word": false,
2129
+ "special": false
2130
+ },
2131
+ "21389": {
2132
+ "content": "慤",
2133
+ "lstrip": false,
2134
+ "normalized": true,
2135
+ "rstrip": false,
2136
+ "single_word": false,
2137
+ "special": false
2138
+ },
2139
+ "21390": {
2140
+ "content": "耖",
2141
+ "lstrip": false,
2142
+ "normalized": true,
2143
+ "rstrip": false,
2144
+ "single_word": false,
2145
+ "special": false
2146
+ },
2147
+ "21391": {
2148
+ "content": "蛺",
2149
+ "lstrip": false,
2150
+ "normalized": true,
2151
+ "rstrip": false,
2152
+ "single_word": false,
2153
+ "special": false
2154
+ },
2155
+ "21392": {
2156
+ "content": "岃",
2157
+ "lstrip": false,
2158
+ "normalized": true,
2159
+ "rstrip": false,
2160
+ "single_word": false,
2161
+ "special": false
2162
+ },
2163
+ "21393": {
2164
+ "content": "琤",
2165
+ "lstrip": false,
2166
+ "normalized": true,
2167
+ "rstrip": false,
2168
+ "single_word": false,
2169
+ "special": false
2170
+ },
2171
+ "21394": {
2172
+ "content": "濰",
2173
+ "lstrip": false,
2174
+ "normalized": true,
2175
+ "rstrip": false,
2176
+ "single_word": false,
2177
+ "special": false
2178
+ },
2179
+ "21395": {
2180
+ "content": "鱘",
2181
+ "lstrip": false,
2182
+ "normalized": true,
2183
+ "rstrip": false,
2184
+ "single_word": false,
2185
+ "special": false
2186
+ },
2187
+ "21396": {
2188
+ "content": "潁",
2189
+ "lstrip": false,
2190
+ "normalized": true,
2191
+ "rstrip": false,
2192
+ "single_word": false,
2193
+ "special": false
2194
+ },
2195
+ "21397": {
2196
+ "content": "鏇",
2197
+ "lstrip": false,
2198
+ "normalized": true,
2199
+ "rstrip": false,
2200
+ "single_word": false,
2201
+ "special": false
2202
+ },
2203
+ "21398": {
2204
+ "content": "搾",
2205
+ "lstrip": false,
2206
+ "normalized": true,
2207
+ "rstrip": false,
2208
+ "single_word": false,
2209
+ "special": false
2210
+ },
2211
+ "21399": {
2212
+ "content": "旼",
2213
+ "lstrip": false,
2214
+ "normalized": true,
2215
+ "rstrip": false,
2216
+ "single_word": false,
2217
+ "special": false
2218
+ },
2219
+ "21400": {
2220
+ "content": "嬋",
2221
+ "lstrip": false,
2222
+ "normalized": true,
2223
+ "rstrip": false,
2224
+ "single_word": false,
2225
+ "special": false
2226
+ },
2227
+ "21401": {
2228
+ "content": "唓",
2229
+ "lstrip": false,
2230
+ "normalized": true,
2231
+ "rstrip": false,
2232
+ "single_word": false,
2233
+ "special": false
2234
+ },
2235
+ "21402": {
2236
+ "content": "唂",
2237
+ "lstrip": false,
2238
+ "normalized": true,
2239
+ "rstrip": false,
2240
+ "single_word": false,
2241
+ "special": false
2242
+ },
2243
+ "21403": {
2244
+ "content": "吲",
2245
+ "lstrip": false,
2246
+ "normalized": true,
2247
+ "rstrip": false,
2248
+ "single_word": false,
2249
+ "special": false
2250
+ },
2251
+ "21404": {
2252
+ "content": "踼",
2253
+ "lstrip": false,
2254
+ "normalized": true,
2255
+ "rstrip": false,
2256
+ "single_word": false,
2257
+ "special": false
2258
+ },
2259
+ "21405": {
2260
+ "content": "靑",
2261
+ "lstrip": false,
2262
+ "normalized": true,
2263
+ "rstrip": false,
2264
+ "single_word": false,
2265
+ "special": false
2266
+ },
2267
+ "21406": {
2268
+ "content": "鄕",
2269
+ "lstrip": false,
2270
+ "normalized": true,
2271
+ "rstrip": false,
2272
+ "single_word": false,
2273
+ "special": false
2274
+ },
2275
+ "21407": {
2276
+ "content": "湴",
2277
+ "lstrip": false,
2278
+ "normalized": true,
2279
+ "rstrip": false,
2280
+ "single_word": false,
2281
+ "special": false
2282
+ },
2283
+ "21408": {
2284
+ "content": "鬈",
2285
+ "lstrip": false,
2286
+ "normalized": true,
2287
+ "rstrip": false,
2288
+ "single_word": false,
2289
+ "special": false
2290
+ },
2291
+ "21409": {
2292
+ "content": "閭",
2293
+ "lstrip": false,
2294
+ "normalized": true,
2295
+ "rstrip": false,
2296
+ "single_word": false,
2297
+ "special": false
2298
+ },
2299
+ "21410": {
2300
+ "content": "癩",
2301
+ "lstrip": false,
2302
+ "normalized": true,
2303
+ "rstrip": false,
2304
+ "single_word": false,
2305
+ "special": false
2306
+ },
2307
+ "21411": {
2308
+ "content": "顥",
2309
+ "lstrip": false,
2310
+ "normalized": true,
2311
+ "rstrip": false,
2312
+ "single_word": false,
2313
+ "special": false
2314
+ },
2315
+ "21412": {
2316
+ "content": "訌",
2317
+ "lstrip": false,
2318
+ "normalized": true,
2319
+ "rstrip": false,
2320
+ "single_word": false,
2321
+ "special": false
2322
+ },
2323
+ "21413": {
2324
+ "content": "樴",
2325
+ "lstrip": false,
2326
+ "normalized": true,
2327
+ "rstrip": false,
2328
+ "single_word": false,
2329
+ "special": false
2330
+ },
2331
+ "21414": {
2332
+ "content": "饀",
2333
+ "lstrip": false,
2334
+ "normalized": true,
2335
+ "rstrip": false,
2336
+ "single_word": false,
2337
+ "special": false
2338
+ },
2339
+ "21415": {
2340
+ "content": "獴",
2341
+ "lstrip": false,
2342
+ "normalized": true,
2343
+ "rstrip": false,
2344
+ "single_word": false,
2345
+ "special": false
2346
+ },
2347
+ "21416": {
2348
+ "content": "畵",
2349
+ "lstrip": false,
2350
+ "normalized": true,
2351
+ "rstrip": false,
2352
+ "single_word": false,
2353
+ "special": false
2354
+ },
2355
+ "21417": {
2356
+ "content": "祼",
2357
+ "lstrip": false,
2358
+ "normalized": true,
2359
+ "rstrip": false,
2360
+ "single_word": false,
2361
+ "special": false
2362
+ },
2363
+ "21418": {
2364
+ "content": "簒",
2365
+ "lstrip": false,
2366
+ "normalized": true,
2367
+ "rstrip": false,
2368
+ "single_word": false,
2369
+ "special": false
2370
+ },
2371
+ "21419": {
2372
+ "content": "蟈",
2373
+ "lstrip": false,
2374
+ "normalized": true,
2375
+ "rstrip": false,
2376
+ "single_word": false,
2377
+ "special": false
2378
+ },
2379
+ "21420": {
2380
+ "content": "灕",
2381
+ "lstrip": false,
2382
+ "normalized": true,
2383
+ "rstrip": false,
2384
+ "single_word": false,
2385
+ "special": false
2386
+ },
2387
+ "21421": {
2388
+ "content": "僆",
2389
+ "lstrip": false,
2390
+ "normalized": true,
2391
+ "rstrip": false,
2392
+ "single_word": false,
2393
+ "special": false
2394
+ },
2395
+ "21422": {
2396
+ "content": "儁",
2397
+ "lstrip": false,
2398
+ "normalized": true,
2399
+ "rstrip": false,
2400
+ "single_word": false,
2401
+ "special": false
2402
+ },
2403
+ "21423": {
2404
+ "content": "瑂",
2405
+ "lstrip": false,
2406
+ "normalized": true,
2407
+ "rstrip": false,
2408
+ "single_word": false,
2409
+ "special": false
2410
+ },
2411
+ "21424": {
2412
+ "content": "趷",
2413
+ "lstrip": false,
2414
+ "normalized": true,
2415
+ "rstrip": false,
2416
+ "single_word": false,
2417
+ "special": false
2418
+ },
2419
+ "21425": {
2420
+ "content": "糴",
2421
+ "lstrip": false,
2422
+ "normalized": true,
2423
+ "rstrip": false,
2424
+ "single_word": false,
2425
+ "special": false
2426
+ },
2427
+ "21426": {
2428
+ "content": "躂",
2429
+ "lstrip": false,
2430
+ "normalized": true,
2431
+ "rstrip": false,
2432
+ "single_word": false,
2433
+ "special": false
2434
+ },
2435
+ "21427": {
2436
+ "content": "顓",
2437
+ "lstrip": false,
2438
+ "normalized": true,
2439
+ "rstrip": false,
2440
+ "single_word": false,
2441
+ "special": false
2442
+ },
2443
+ "21428": {
2444
+ "content": "啋",
2445
+ "lstrip": false,
2446
+ "normalized": true,
2447
+ "rstrip": false,
2448
+ "single_word": false,
2449
+ "special": false
2450
+ },
2451
+ "21429": {
2452
+ "content": "揞",
2453
+ "lstrip": false,
2454
+ "normalized": true,
2455
+ "rstrip": false,
2456
+ "single_word": false,
2457
+ "special": false
2458
+ },
2459
+ "21430": {
2460
+ "content": "罉",
2461
+ "lstrip": false,
2462
+ "normalized": true,
2463
+ "rstrip": false,
2464
+ "single_word": false,
2465
+ "special": false
2466
+ },
2467
+ "21431": {
2468
+ "content": "媺",
2469
+ "lstrip": false,
2470
+ "normalized": true,
2471
+ "rstrip": false,
2472
+ "single_word": false,
2473
+ "special": false
2474
+ },
2475
+ "21432": {
2476
+ "content": "銼",
2477
+ "lstrip": false,
2478
+ "normalized": true,
2479
+ "rstrip": false,
2480
+ "single_word": false,
2481
+ "special": false
2482
+ },
2483
+ "21433": {
2484
+ "content": "癗",
2485
+ "lstrip": false,
2486
+ "normalized": true,
2487
+ "rstrip": false,
2488
+ "single_word": false,
2489
+ "special": false
2490
+ },
2491
+ "21434": {
2492
+ "content": "軻",
2493
+ "lstrip": false,
2494
+ "normalized": true,
2495
+ "rstrip": false,
2496
+ "single_word": false,
2497
+ "special": false
2498
+ },
2499
+ "21435": {
2500
+ "content": "瀡",
2501
+ "lstrip": false,
2502
+ "normalized": true,
2503
+ "rstrip": false,
2504
+ "single_word": false,
2505
+ "special": false
2506
+ },
2507
+ "21436": {
2508
+ "content": "鼴",
2509
+ "lstrip": false,
2510
+ "normalized": true,
2511
+ "rstrip": false,
2512
+ "single_word": false,
2513
+ "special": false
2514
+ },
2515
+ "21437": {
2516
+ "content": "暪",
2517
+ "lstrip": false,
2518
+ "normalized": true,
2519
+ "rstrip": false,
2520
+ "single_word": false,
2521
+ "special": false
2522
+ },
2523
+ "21438": {
2524
+ "content": "摱",
2525
+ "lstrip": false,
2526
+ "normalized": true,
2527
+ "rstrip": false,
2528
+ "single_word": false,
2529
+ "special": false
2530
+ },
2531
+ "21439": {
2532
+ "content": "厏",
2533
+ "lstrip": false,
2534
+ "normalized": true,
2535
+ "rstrip": false,
2536
+ "single_word": false,
2537
+ "special": false
2538
+ },
2539
+ "21440": {
2540
+ "content": "銫",
2541
+ "lstrip": false,
2542
+ "normalized": true,
2543
+ "rstrip": false,
2544
+ "single_word": false,
2545
+ "special": false
2546
+ },
2547
+ "21441": {
2548
+ "content": "綷",
2549
+ "lstrip": false,
2550
+ "normalized": true,
2551
+ "rstrip": false,
2552
+ "single_word": false,
2553
+ "special": false
2554
+ },
2555
+ "21442": {
2556
+ "content": "闓",
2557
+ "lstrip": false,
2558
+ "normalized": true,
2559
+ "rstrip": false,
2560
+ "single_word": false,
2561
+ "special": false
2562
+ },
2563
+ "21443": {
2564
+ "content": "湉",
2565
+ "lstrip": false,
2566
+ "normalized": true,
2567
+ "rstrip": false,
2568
+ "single_word": false,
2569
+ "special": false
2570
+ },
2571
+ "21444": {
2572
+ "content": "鏵",
2573
+ "lstrip": false,
2574
+ "normalized": true,
2575
+ "rstrip": false,
2576
+ "single_word": false,
2577
+ "special": false
2578
+ },
2579
+ "21445": {
2580
+ "content": "塲",
2581
+ "lstrip": false,
2582
+ "normalized": true,
2583
+ "rstrip": false,
2584
+ "single_word": false,
2585
+ "special": false
2586
+ },
2587
+ "21446": {
2588
+ "content": "彊",
2589
+ "lstrip": false,
2590
+ "normalized": true,
2591
+ "rstrip": false,
2592
+ "single_word": false,
2593
+ "special": false
2594
+ },
2595
+ "21447": {
2596
+ "content": "氬",
2597
+ "lstrip": false,
2598
+ "normalized": true,
2599
+ "rstrip": false,
2600
+ "single_word": false,
2601
+ "special": false
2602
+ },
2603
+ "21448": {
2604
+ "content": "繑",
2605
+ "lstrip": false,
2606
+ "normalized": true,
2607
+ "rstrip": false,
2608
+ "single_word": false,
2609
+ "special": false
2610
+ },
2611
+ "21449": {
2612
+ "content": "厓",
2613
+ "lstrip": false,
2614
+ "normalized": true,
2615
+ "rstrip": false,
2616
+ "single_word": false,
2617
+ "special": false
2618
+ },
2619
+ "21450": {
2620
+ "content": "釙",
2621
+ "lstrip": false,
2622
+ "normalized": true,
2623
+ "rstrip": false,
2624
+ "single_word": false,
2625
+ "special": false
2626
+ },
2627
+ "21451": {
2628
+ "content": "鳯",
2629
+ "lstrip": false,
2630
+ "normalized": true,
2631
+ "rstrip": false,
2632
+ "single_word": false,
2633
+ "special": false
2634
+ },
2635
+ "21452": {
2636
+ "content": "鑌",
2637
+ "lstrip": false,
2638
+ "normalized": true,
2639
+ "rstrip": false,
2640
+ "single_word": false,
2641
+ "special": false
2642
+ },
2643
+ "21453": {
2644
+ "content": "覲",
2645
+ "lstrip": false,
2646
+ "normalized": true,
2647
+ "rstrip": false,
2648
+ "single_word": false,
2649
+ "special": false
2650
+ },
2651
+ "21454": {
2652
+ "content": "凖",
2653
+ "lstrip": false,
2654
+ "normalized": true,
2655
+ "rstrip": false,
2656
+ "single_word": false,
2657
+ "special": false
2658
+ },
2659
+ "21455": {
2660
+ "content": "嚦",
2661
+ "lstrip": false,
2662
+ "normalized": true,
2663
+ "rstrip": false,
2664
+ "single_word": false,
2665
+ "special": false
2666
+ },
2667
+ "21456": {
2668
+ "content": "翕",
2669
+ "lstrip": false,
2670
+ "normalized": true,
2671
+ "rstrip": false,
2672
+ "single_word": false,
2673
+ "special": false
2674
+ },
2675
+ "21457": {
2676
+ "content": "甂",
2677
+ "lstrip": false,
2678
+ "normalized": true,
2679
+ "rstrip": false,
2680
+ "single_word": false,
2681
+ "special": false
2682
+ },
2683
+ "21458": {
2684
+ "content": "蓀",
2685
+ "lstrip": false,
2686
+ "normalized": true,
2687
+ "rstrip": false,
2688
+ "single_word": false,
2689
+ "special": false
2690
+ },
2691
+ "21459": {
2692
+ "content": "昰",
2693
+ "lstrip": false,
2694
+ "normalized": true,
2695
+ "rstrip": false,
2696
+ "single_word": false,
2697
+ "special": false
2698
+ },
2699
+ "21460": {
2700
+ "content": "疎",
2701
+ "lstrip": false,
2702
+ "normalized": true,
2703
+ "rstrip": false,
2704
+ "single_word": false,
2705
+ "special": false
2706
+ },
2707
+ "21461": {
2708
+ "content": "泂",
2709
+ "lstrip": false,
2710
+ "normalized": true,
2711
+ "rstrip": false,
2712
+ "single_word": false,
2713
+ "special": false
2714
+ },
2715
+ "21462": {
2716
+ "content": "銥",
2717
+ "lstrip": false,
2718
+ "normalized": true,
2719
+ "rstrip": false,
2720
+ "single_word": false,
2721
+ "special": false
2722
+ },
2723
+ "21463": {
2724
+ "content": "閆",
2725
+ "lstrip": false,
2726
+ "normalized": true,
2727
+ "rstrip": false,
2728
+ "single_word": false,
2729
+ "special": false
2730
+ },
2731
+ "21464": {
2732
+ "content": "魨",
2733
+ "lstrip": false,
2734
+ "normalized": true,
2735
+ "rstrip": false,
2736
+ "single_word": false,
2737
+ "special": false
2738
+ },
2739
+ "21465": {
2740
+ "content": "妺",
2741
+ "lstrip": false,
2742
+ "normalized": true,
2743
+ "rstrip": false,
2744
+ "single_word": false,
2745
+ "special": false
2746
+ },
2747
+ "21466": {
2748
+ "content": "訢",
2749
+ "lstrip": false,
2750
+ "normalized": true,
2751
+ "rstrip": false,
2752
+ "single_word": false,
2753
+ "special": false
2754
+ },
2755
+ "21467": {
2756
+ "content": "蠏",
2757
+ "lstrip": false,
2758
+ "normalized": true,
2759
+ "rstrip": false,
2760
+ "single_word": false,
2761
+ "special": false
2762
+ },
2763
+ "21468": {
2764
+ "content": "濶",
2765
+ "lstrip": false,
2766
+ "normalized": true,
2767
+ "rstrip": false,
2768
+ "single_word": false,
2769
+ "special": false
2770
+ },
2771
+ "21469": {
2772
+ "content": "衊",
2773
+ "lstrip": false,
2774
+ "normalized": true,
2775
+ "rstrip": false,
2776
+ "single_word": false,
2777
+ "special": false
2778
+ },
2779
+ "21470": {
2780
+ "content": "紇",
2781
+ "lstrip": false,
2782
+ "normalized": true,
2783
+ "rstrip": false,
2784
+ "single_word": false,
2785
+ "special": false
2786
+ },
2787
+ "21471": {
2788
+ "content": "僞",
2789
+ "lstrip": false,
2790
+ "normalized": true,
2791
+ "rstrip": false,
2792
+ "single_word": false,
2793
+ "special": false
2794
+ },
2795
+ "21472": {
2796
+ "content": "祆",
2797
+ "lstrip": false,
2798
+ "normalized": true,
2799
+ "rstrip": false,
2800
+ "single_word": false,
2801
+ "special": false
2802
+ },
2803
+ "21473": {
2804
+ "content": "謖",
2805
+ "lstrip": false,
2806
+ "normalized": true,
2807
+ "rstrip": false,
2808
+ "single_word": false,
2809
+ "special": false
2810
+ },
2811
+ "21474": {
2812
+ "content": "炑",
2813
+ "lstrip": false,
2814
+ "normalized": true,
2815
+ "rstrip": false,
2816
+ "single_word": false,
2817
+ "special": false
2818
+ },
2819
+ "21475": {
2820
+ "content": "樨",
2821
+ "lstrip": false,
2822
+ "normalized": true,
2823
+ "rstrip": false,
2824
+ "single_word": false,
2825
+ "special": false
2826
+ },
2827
+ "21476": {
2828
+ "content": "拕",
2829
+ "lstrip": false,
2830
+ "normalized": true,
2831
+ "rstrip": false,
2832
+ "single_word": false,
2833
+ "special": false
2834
+ },
2835
+ "21477": {
2836
+ "content": "憓",
2837
+ "lstrip": false,
2838
+ "normalized": true,
2839
+ "rstrip": false,
2840
+ "single_word": false,
2841
+ "special": false
2842
+ },
2843
+ "21478": {
2844
+ "content": "蘅",
2845
+ "lstrip": false,
2846
+ "normalized": true,
2847
+ "rstrip": false,
2848
+ "single_word": false,
2849
+ "special": false
2850
+ },
2851
+ "21479": {
2852
+ "content": "廻",
2853
+ "lstrip": false,
2854
+ "normalized": true,
2855
+ "rstrip": false,
2856
+ "single_word": false,
2857
+ "special": false
2858
+ },
2859
+ "21480": {
2860
+ "content": "馱",
2861
+ "lstrip": false,
2862
+ "normalized": true,
2863
+ "rstrip": false,
2864
+ "single_word": false,
2865
+ "special": false
2866
+ },
2867
+ "21481": {
2868
+ "content": "塹",
2869
+ "lstrip": false,
2870
+ "normalized": true,
2871
+ "rstrip": false,
2872
+ "single_word": false,
2873
+ "special": false
2874
+ },
2875
+ "21482": {
2876
+ "content": "啹",
2877
+ "lstrip": false,
2878
+ "normalized": true,
2879
+ "rstrip": false,
2880
+ "single_word": false,
2881
+ "special": false
2882
+ },
2883
+ "21483": {
2884
+ "content": "昃",
2885
+ "lstrip": false,
2886
+ "normalized": true,
2887
+ "rstrip": false,
2888
+ "single_word": false,
2889
+ "special": false
2890
+ },
2891
+ "21484": {
2892
+ "content": "屘",
2893
+ "lstrip": false,
2894
+ "normalized": true,
2895
+ "rstrip": false,
2896
+ "single_word": false,
2897
+ "special": false
2898
+ },
2899
+ "21485": {
2900
+ "content": "㴓",
2901
+ "lstrip": false,
2902
+ "normalized": true,
2903
+ "rstrip": false,
2904
+ "single_word": false,
2905
+ "special": false
2906
+ },
2907
+ "21486": {
2908
+ "content": "掗",
2909
+ "lstrip": false,
2910
+ "normalized": true,
2911
+ "rstrip": false,
2912
+ "single_word": false,
2913
+ "special": false
2914
+ },
2915
+ "21487": {
2916
+ "content": "騾",
2917
+ "lstrip": false,
2918
+ "normalized": true,
2919
+ "rstrip": false,
2920
+ "single_word": false,
2921
+ "special": false
2922
+ },
2923
+ "21488": {
2924
+ "content": "苺",
2925
+ "lstrip": false,
2926
+ "normalized": true,
2927
+ "rstrip": false,
2928
+ "single_word": false,
2929
+ "special": false
2930
+ },
2931
+ "21489": {
2932
+ "content": "駟",
2933
+ "lstrip": false,
2934
+ "normalized": true,
2935
+ "rstrip": false,
2936
+ "single_word": false,
2937
+ "special": false
2938
+ },
2939
+ "21490": {
2940
+ "content": "沚",
2941
+ "lstrip": false,
2942
+ "normalized": true,
2943
+ "rstrip": false,
2944
+ "single_word": false,
2945
+ "special": false
2946
+ },
2947
+ "21491": {
2948
+ "content": "豕",
2949
+ "lstrip": false,
2950
+ "normalized": true,
2951
+ "rstrip": false,
2952
+ "single_word": false,
2953
+ "special": false
2954
+ },
2955
+ "21492": {
2956
+ "content": "奭",
2957
+ "lstrip": false,
2958
+ "normalized": true,
2959
+ "rstrip": false,
2960
+ "single_word": false,
2961
+ "special": false
2962
+ },
2963
+ "21493": {
2964
+ "content": "㦸",
2965
+ "lstrip": false,
2966
+ "normalized": true,
2967
+ "rstrip": false,
2968
+ "single_word": false,
2969
+ "special": false
2970
+ },
2971
+ "21494": {
2972
+ "content": "琿",
2973
+ "lstrip": false,
2974
+ "normalized": true,
2975
+ "rstrip": false,
2976
+ "single_word": false,
2977
+ "special": false
2978
+ },
2979
+ "21495": {
2980
+ "content": "鬭",
2981
+ "lstrip": false,
2982
+ "normalized": true,
2983
+ "rstrip": false,
2984
+ "single_word": false,
2985
+ "special": false
2986
+ },
2987
+ "21496": {
2988
+ "content": "觜",
2989
+ "lstrip": false,
2990
+ "normalized": true,
2991
+ "rstrip": false,
2992
+ "single_word": false,
2993
+ "special": false
2994
+ },
2995
+ "21497": {
2996
+ "content": "軭",
2997
+ "lstrip": false,
2998
+ "normalized": true,
2999
+ "rstrip": false,
3000
+ "single_word": false,
3001
+ "special": false
3002
+ },
3003
+ "21498": {
3004
+ "content": "錒",
3005
+ "lstrip": false,
3006
+ "normalized": true,
3007
+ "rstrip": false,
3008
+ "single_word": false,
3009
+ "special": false
3010
+ },
3011
+ "21499": {
3012
+ "content": "錀",
3013
+ "lstrip": false,
3014
+ "normalized": true,
3015
+ "rstrip": false,
3016
+ "single_word": false,
3017
+ "special": false
3018
+ },
3019
+ "21500": {
3020
+ "content": "欏",
3021
+ "lstrip": false,
3022
+ "normalized": true,
3023
+ "rstrip": false,
3024
+ "single_word": false,
3025
+ "special": false
3026
+ },
3027
+ "21501": {
3028
+ "content": "仼",
3029
+ "lstrip": false,
3030
+ "normalized": true,
3031
+ "rstrip": false,
3032
+ "single_word": false,
3033
+ "special": false
3034
+ },
3035
+ "21502": {
3036
+ "content": "鏹",
3037
+ "lstrip": false,
3038
+ "normalized": true,
3039
+ "rstrip": false,
3040
+ "single_word": false,
3041
+ "special": false
3042
+ },
3043
+ "21503": {
3044
+ "content": "癆",
3045
+ "lstrip": false,
3046
+ "normalized": true,
3047
+ "rstrip": false,
3048
+ "single_word": false,
3049
+ "special": false
3050
+ },
3051
+ "21504": {
3052
+ "content": "囘",
3053
+ "lstrip": false,
3054
+ "normalized": true,
3055
+ "rstrip": false,
3056
+ "single_word": false,
3057
+ "special": false
3058
+ },
3059
+ "21505": {
3060
+ "content": "鐡",
3061
+ "lstrip": false,
3062
+ "normalized": true,
3063
+ "rstrip": false,
3064
+ "single_word": false,
3065
+ "special": false
3066
+ },
3067
+ "21506": {
3068
+ "content": "鼷",
3069
+ "lstrip": false,
3070
+ "normalized": true,
3071
+ "rstrip": false,
3072
+ "single_word": false,
3073
+ "special": false
3074
+ },
3075
+ "21507": {
3076
+ "content": "羕",
3077
+ "lstrip": false,
3078
+ "normalized": true,
3079
+ "rstrip": false,
3080
+ "single_word": false,
3081
+ "special": false
3082
+ },
3083
+ "21508": {
3084
+ "content": "璩",
3085
+ "lstrip": false,
3086
+ "normalized": true,
3087
+ "rstrip": false,
3088
+ "single_word": false,
3089
+ "special": false
3090
+ },
3091
+ "21509": {
3092
+ "content": "沊",
3093
+ "lstrip": false,
3094
+ "normalized": true,
3095
+ "rstrip": false,
3096
+ "single_word": false,
3097
+ "special": false
3098
+ },
3099
+ "21510": {
3100
+ "content": "鵞",
3101
+ "lstrip": false,
3102
+ "normalized": true,
3103
+ "rstrip": false,
3104
+ "single_word": false,
3105
+ "special": false
3106
+ },
3107
+ "21511": {
3108
+ "content": "塡",
3109
+ "lstrip": false,
3110
+ "normalized": true,
3111
+ "rstrip": false,
3112
+ "single_word": false,
3113
+ "special": false
3114
+ },
3115
+ "21512": {
3116
+ "content": "綣",
3117
+ "lstrip": false,
3118
+ "normalized": true,
3119
+ "rstrip": false,
3120
+ "single_word": false,
3121
+ "special": false
3122
+ },
3123
+ "21513": {
3124
+ "content": "龢",
3125
+ "lstrip": false,
3126
+ "normalized": true,
3127
+ "rstrip": false,
3128
+ "single_word": false,
3129
+ "special": false
3130
+ },
3131
+ "21514": {
3132
+ "content": "禰",
3133
+ "lstrip": false,
3134
+ "normalized": true,
3135
+ "rstrip": false,
3136
+ "single_word": false,
3137
+ "special": false
3138
+ },
3139
+ "21515": {
3140
+ "content": "鍔",
3141
+ "lstrip": false,
3142
+ "normalized": true,
3143
+ "rstrip": false,
3144
+ "single_word": false,
3145
+ "special": false
3146
+ },
3147
+ "21516": {
3148
+ "content": "櫟",
3149
+ "lstrip": false,
3150
+ "normalized": true,
3151
+ "rstrip": false,
3152
+ "single_word": false,
3153
+ "special": false
3154
+ },
3155
+ "21517": {
3156
+ "content": "緲",
3157
+ "lstrip": false,
3158
+ "normalized": true,
3159
+ "rstrip": false,
3160
+ "single_word": false,
3161
+ "special": false
3162
+ },
3163
+ "21518": {
3164
+ "content": "蕓",
3165
+ "lstrip": false,
3166
+ "normalized": true,
3167
+ "rstrip": false,
3168
+ "single_word": false,
3169
+ "special": false
3170
+ },
3171
+ "21519": {
3172
+ "content": "齲",
3173
+ "lstrip": false,
3174
+ "normalized": true,
3175
+ "rstrip": false,
3176
+ "single_word": false,
3177
+ "special": false
3178
+ },
3179
+ "21520": {
3180
+ "content": "尙",
3181
+ "lstrip": false,
3182
+ "normalized": true,
3183
+ "rstrip": false,
3184
+ "single_word": false,
3185
+ "special": false
3186
+ },
3187
+ "21521": {
3188
+ "content": "擧",
3189
+ "lstrip": false,
3190
+ "normalized": true,
3191
+ "rstrip": false,
3192
+ "single_word": false,
3193
+ "special": false
3194
+ },
3195
+ "21522": {
3196
+ "content": "勷",
3197
+ "lstrip": false,
3198
+ "normalized": true,
3199
+ "rstrip": false,
3200
+ "single_word": false,
3201
+ "special": false
3202
+ },
3203
+ "21523": {
3204
+ "content": "龑",
3205
+ "lstrip": false,
3206
+ "normalized": true,
3207
+ "rstrip": false,
3208
+ "single_word": false,
3209
+ "special": false
3210
+ },
3211
+ "21524": {
3212
+ "content": "挐",
3213
+ "lstrip": false,
3214
+ "normalized": true,
3215
+ "rstrip": false,
3216
+ "single_word": false,
3217
+ "special": false
3218
+ },
3219
+ "21525": {
3220
+ "content": "鉞",
3221
+ "lstrip": false,
3222
+ "normalized": true,
3223
+ "rstrip": false,
3224
+ "single_word": false,
3225
+ "special": false
3226
+ },
3227
+ "21526": {
3228
+ "content": "鸕",
3229
+ "lstrip": false,
3230
+ "normalized": true,
3231
+ "rstrip": false,
3232
+ "single_word": false,
3233
+ "special": false
3234
+ },
3235
+ "21527": {
3236
+ "content": "愃",
3237
+ "lstrip": false,
3238
+ "normalized": true,
3239
+ "rstrip": false,
3240
+ "single_word": false,
3241
+ "special": false
3242
+ },
3243
+ "21528": {
3244
+ "content": "昅",
3245
+ "lstrip": false,
3246
+ "normalized": true,
3247
+ "rstrip": false,
3248
+ "single_word": false,
3249
+ "special": false
3250
+ },
3251
+ "21529": {
3252
+ "content": "鎘",
3253
+ "lstrip": false,
3254
+ "normalized": true,
3255
+ "rstrip": false,
3256
+ "single_word": false,
3257
+ "special": false
3258
+ },
3259
+ "21530": {
3260
+ "content": "廡",
3261
+ "lstrip": false,
3262
+ "normalized": true,
3263
+ "rstrip": false,
3264
+ "single_word": false,
3265
+ "special": false
3266
+ },
3267
+ "21531": {
3268
+ "content": "詒",
3269
+ "lstrip": false,
3270
+ "normalized": true,
3271
+ "rstrip": false,
3272
+ "single_word": false,
3273
+ "special": false
3274
+ },
3275
+ "21532": {
3276
+ "content": "銻",
3277
+ "lstrip": false,
3278
+ "normalized": true,
3279
+ "rstrip": false,
3280
+ "single_word": false,
3281
+ "special": false
3282
+ },
3283
+ "21533": {
3284
+ "content": "咃",
3285
+ "lstrip": false,
3286
+ "normalized": true,
3287
+ "rstrip": false,
3288
+ "single_word": false,
3289
+ "special": false
3290
+ },
3291
+ "21534": {
3292
+ "content": "饉",
3293
+ "lstrip": false,
3294
+ "normalized": true,
3295
+ "rstrip": false,
3296
+ "single_word": false,
3297
+ "special": false
3298
+ },
3299
+ "21535": {
3300
+ "content": "瀦",
3301
+ "lstrip": false,
3302
+ "normalized": true,
3303
+ "rstrip": false,
3304
+ "single_word": false,
3305
+ "special": false
3306
+ },
3307
+ "21536": {
3308
+ "content": "姵",
3309
+ "lstrip": false,
3310
+ "normalized": true,
3311
+ "rstrip": false,
3312
+ "single_word": false,
3313
+ "special": false
3314
+ },
3315
+ "21537": {
3316
+ "content": "侘",
3317
+ "lstrip": false,
3318
+ "normalized": true,
3319
+ "rstrip": false,
3320
+ "single_word": false,
3321
+ "special": false
3322
+ },
3323
+ "21538": {
3324
+ "content": "垻",
3325
+ "lstrip": false,
3326
+ "normalized": true,
3327
+ "rstrip": false,
3328
+ "single_word": false,
3329
+ "special": false
3330
+ },
3331
+ "21539": {
3332
+ "content": "甑",
3333
+ "lstrip": false,
3334
+ "normalized": true,
3335
+ "rstrip": false,
3336
+ "single_word": false,
3337
+ "special": false
3338
+ },
3339
+ "21540": {
3340
+ "content": "猻",
3341
+ "lstrip": false,
3342
+ "normalized": true,
3343
+ "rstrip": false,
3344
+ "single_word": false,
3345
+ "special": false
3346
+ },
3347
+ "21541": {
3348
+ "content": "扤",
3349
+ "lstrip": false,
3350
+ "normalized": true,
3351
+ "rstrip": false,
3352
+ "single_word": false,
3353
+ "special": false
3354
+ },
3355
+ "21542": {
3356
+ "content": "禕",
3357
+ "lstrip": false,
3358
+ "normalized": true,
3359
+ "rstrip": false,
3360
+ "single_word": false,
3361
+ "special": false
3362
+ },
3363
+ "21543": {
3364
+ "content": "鬩",
3365
+ "lstrip": false,
3366
+ "normalized": true,
3367
+ "rstrip": false,
3368
+ "single_word": false,
3369
+ "special": false
3370
+ },
3371
+ "21544": {
3372
+ "content": "攆",
3373
+ "lstrip": false,
3374
+ "normalized": true,
3375
+ "rstrip": false,
3376
+ "single_word": false,
3377
+ "special": false
3378
+ },
3379
+ "21545": {
3380
+ "content": "鈹",
3381
+ "lstrip": false,
3382
+ "normalized": true,
3383
+ "rstrip": false,
3384
+ "single_word": false,
3385
+ "special": false
3386
+ },
3387
+ "21546": {
3388
+ "content": "酎",
3389
+ "lstrip": false,
3390
+ "normalized": true,
3391
+ "rstrip": false,
3392
+ "single_word": false,
3393
+ "special": false
3394
+ },
3395
+ "21547": {
3396
+ "content": "吿",
3397
+ "lstrip": false,
3398
+ "normalized": true,
3399
+ "rstrip": false,
3400
+ "single_word": false,
3401
+ "special": false
3402
+ },
3403
+ "21548": {
3404
+ "content": "鈷",
3405
+ "lstrip": false,
3406
+ "normalized": true,
3407
+ "rstrip": false,
3408
+ "single_word": false,
3409
+ "special": false
3410
+ },
3411
+ "21549": {
3412
+ "content": "扲",
3413
+ "lstrip": false,
3414
+ "normalized": true,
3415
+ "rstrip": false,
3416
+ "single_word": false,
3417
+ "special": false
3418
+ },
3419
+ "21550": {
3420
+ "content": "竪",
3421
+ "lstrip": false,
3422
+ "normalized": true,
3423
+ "rstrip": false,
3424
+ "single_word": false,
3425
+ "special": false
3426
+ },
3427
+ "21551": {
3428
+ "content": "柙",
3429
+ "lstrip": false,
3430
+ "normalized": true,
3431
+ "rstrip": false,
3432
+ "single_word": false,
3433
+ "special": false
3434
+ },
3435
+ "21552": {
3436
+ "content": "沔",
3437
+ "lstrip": false,
3438
+ "normalized": true,
3439
+ "rstrip": false,
3440
+ "single_word": false,
3441
+ "special": false
3442
+ },
3443
+ "21553": {
3444
+ "content": "缐",
3445
+ "lstrip": false,
3446
+ "normalized": true,
3447
+ "rstrip": false,
3448
+ "single_word": false,
3449
+ "special": false
3450
+ },
3451
+ "21554": {
3452
+ "content": "瓘",
3453
+ "lstrip": false,
3454
+ "normalized": true,
3455
+ "rstrip": false,
3456
+ "single_word": false,
3457
+ "special": false
3458
+ },
3459
+ "21555": {
3460
+ "content": "璘",
3461
+ "lstrip": false,
3462
+ "normalized": true,
3463
+ "rstrip": false,
3464
+ "single_word": false,
3465
+ "special": false
3466
+ },
3467
+ "21556": {
3468
+ "content": "璦",
3469
+ "lstrip": false,
3470
+ "normalized": true,
3471
+ "rstrip": false,
3472
+ "single_word": false,
3473
+ "special": false
3474
+ },
3475
+ "21557": {
3476
+ "content": "朏",
3477
+ "lstrip": false,
3478
+ "normalized": true,
3479
+ "rstrip": false,
3480
+ "single_word": false,
3481
+ "special": false
3482
+ },
3483
+ "21558": {
3484
+ "content": "暎",
3485
+ "lstrip": false,
3486
+ "normalized": true,
3487
+ "rstrip": false,
3488
+ "single_word": false,
3489
+ "special": false
3490
+ },
3491
+ "21559": {
3492
+ "content": "鶿",
3493
+ "lstrip": false,
3494
+ "normalized": true,
3495
+ "rstrip": false,
3496
+ "single_word": false,
3497
+ "special": false
3498
+ },
3499
+ "21560": {
3500
+ "content": "榘",
3501
+ "lstrip": false,
3502
+ "normalized": true,
3503
+ "rstrip": false,
3504
+ "single_word": false,
3505
+ "special": false
3506
+ },
3507
+ "21561": {
3508
+ "content": "飮",
3509
+ "lstrip": false,
3510
+ "normalized": true,
3511
+ "rstrip": false,
3512
+ "single_word": false,
3513
+ "special": false
3514
+ },
3515
+ "21562": {
3516
+ "content": "愨",
3517
+ "lstrip": false,
3518
+ "normalized": true,
3519
+ "rstrip": false,
3520
+ "single_word": false,
3521
+ "special": false
3522
+ },
3523
+ "21563": {
3524
+ "content": "舢",
3525
+ "lstrip": false,
3526
+ "normalized": true,
3527
+ "rstrip": false,
3528
+ "single_word": false,
3529
+ "special": false
3530
+ },
3531
+ "21564": {
3532
+ "content": "僳",
3533
+ "lstrip": false,
3534
+ "normalized": true,
3535
+ "rstrip": false,
3536
+ "single_word": false,
3537
+ "special": false
3538
+ },
3539
+ "21565": {
3540
+ "content": "咼",
3541
+ "lstrip": false,
3542
+ "normalized": true,
3543
+ "rstrip": false,
3544
+ "single_word": false,
3545
+ "special": false
3546
+ },
3547
+ "21566": {
3548
+ "content": "臏",
3549
+ "lstrip": false,
3550
+ "normalized": true,
3551
+ "rstrip": false,
3552
+ "single_word": false,
3553
+ "special": false
3554
+ },
3555
+ "21567": {
3556
+ "content": "鑭",
3557
+ "lstrip": false,
3558
+ "normalized": true,
3559
+ "rstrip": false,
3560
+ "single_word": false,
3561
+ "special": false
3562
+ },
3563
+ "21568": {
3564
+ "content": "砬",
3565
+ "lstrip": false,
3566
+ "normalized": true,
3567
+ "rstrip": false,
3568
+ "single_word": false,
3569
+ "special": false
3570
+ },
3571
+ "21569": {
3572
+ "content": "娸",
3573
+ "lstrip": false,
3574
+ "normalized": true,
3575
+ "rstrip": false,
3576
+ "single_word": false,
3577
+ "special": false
3578
+ },
3579
+ "21570": {
3580
+ "content": "硃",
3581
+ "lstrip": false,
3582
+ "normalized": true,
3583
+ "rstrip": false,
3584
+ "single_word": false,
3585
+ "special": false
3586
+ },
3587
+ "21571": {
3588
+ "content": "訃",
3589
+ "lstrip": false,
3590
+ "normalized": true,
3591
+ "rstrip": false,
3592
+ "single_word": false,
3593
+ "special": false
3594
+ },
3595
+ "21572": {
3596
+ "content": "朳",
3597
+ "lstrip": false,
3598
+ "normalized": true,
3599
+ "rstrip": false,
3600
+ "single_word": false,
3601
+ "special": false
3602
+ },
3603
+ "21573": {
3604
+ "content": "瑭",
3605
+ "lstrip": false,
3606
+ "normalized": true,
3607
+ "rstrip": false,
3608
+ "single_word": false,
3609
+ "special": false
3610
+ },
3611
+ "21574": {
3612
+ "content": "氚",
3613
+ "lstrip": false,
3614
+ "normalized": true,
3615
+ "rstrip": false,
3616
+ "single_word": false,
3617
+ "special": false
3618
+ },
3619
+ "21575": {
3620
+ "content": "綉",
3621
+ "lstrip": false,
3622
+ "normalized": true,
3623
+ "rstrip": false,
3624
+ "single_word": false,
3625
+ "special": false
3626
+ },
3627
+ "21576": {
3628
+ "content": "痲",
3629
+ "lstrip": false,
3630
+ "normalized": true,
3631
+ "rstrip": false,
3632
+ "single_word": false,
3633
+ "special": false
3634
+ },
3635
+ "21577": {
3636
+ "content": "麫",
3637
+ "lstrip": false,
3638
+ "normalized": true,
3639
+ "rstrip": false,
3640
+ "single_word": false,
3641
+ "special": false
3642
+ },
3643
+ "21578": {
3644
+ "content": "齶",
3645
+ "lstrip": false,
3646
+ "normalized": true,
3647
+ "rstrip": false,
3648
+ "single_word": false,
3649
+ "special": false
3650
+ },
3651
+ "21579": {
3652
+ "content": "蚧",
3653
+ "lstrip": false,
3654
+ "normalized": true,
3655
+ "rstrip": false,
3656
+ "single_word": false,
3657
+ "special": false
3658
+ },
3659
+ "21580": {
3660
+ "content": "谿",
3661
+ "lstrip": false,
3662
+ "normalized": true,
3663
+ "rstrip": false,
3664
+ "single_word": false,
3665
+ "special": false
3666
+ },
3667
+ "21581": {
3668
+ "content": "栱",
3669
+ "lstrip": false,
3670
+ "normalized": true,
3671
+ "rstrip": false,
3672
+ "single_word": false,
3673
+ "special": false
3674
+ },
3675
+ "21582": {
3676
+ "content": "旚",
3677
+ "lstrip": false,
3678
+ "normalized": true,
3679
+ "rstrip": false,
3680
+ "single_word": false,
3681
+ "special": false
3682
+ },
3683
+ "21583": {
3684
+ "content": "鱟",
3685
+ "lstrip": false,
3686
+ "normalized": true,
3687
+ "rstrip": false,
3688
+ "single_word": false,
3689
+ "special": false
3690
+ },
3691
+ "21584": {
3692
+ "content": "鉍",
3693
+ "lstrip": false,
3694
+ "normalized": true,
3695
+ "rstrip": false,
3696
+ "single_word": false,
3697
+ "special": false
3698
+ },
3699
+ "21585": {
3700
+ "content": "鏐",
3701
+ "lstrip": false,
3702
+ "normalized": true,
3703
+ "rstrip": false,
3704
+ "single_word": false,
3705
+ "special": false
3706
+ },
3707
+ "21586": {
3708
+ "content": "粢",
3709
+ "lstrip": false,
3710
+ "normalized": true,
3711
+ "rstrip": false,
3712
+ "single_word": false,
3713
+ "special": false
3714
+ },
3715
+ "21587": {
3716
+ "content": "縹",
3717
+ "lstrip": false,
3718
+ "normalized": true,
3719
+ "rstrip": false,
3720
+ "single_word": false,
3721
+ "special": false
3722
+ },
3723
+ "21588": {
3724
+ "content": "鉬",
3725
+ "lstrip": false,
3726
+ "normalized": true,
3727
+ "rstrip": false,
3728
+ "single_word": false,
3729
+ "special": false
3730
+ },
3731
+ "21589": {
3732
+ "content": "擗",
3733
+ "lstrip": false,
3734
+ "normalized": true,
3735
+ "rstrip": false,
3736
+ "single_word": false,
3737
+ "special": false
3738
+ },
3739
+ "21590": {
3740
+ "content": "鋯",
3741
+ "lstrip": false,
3742
+ "normalized": true,
3743
+ "rstrip": false,
3744
+ "single_word": false,
3745
+ "special": false
3746
+ },
3747
+ "21591": {
3748
+ "content": "闐",
3749
+ "lstrip": false,
3750
+ "normalized": true,
3751
+ "rstrip": false,
3752
+ "single_word": false,
3753
+ "special": false
3754
+ },
3755
+ "21592": {
3756
+ "content": "舨",
3757
+ "lstrip": false,
3758
+ "normalized": true,
3759
+ "rstrip": false,
3760
+ "single_word": false,
3761
+ "special": false
3762
+ },
3763
+ "21593": {
3764
+ "content": "艶",
3765
+ "lstrip": false,
3766
+ "normalized": true,
3767
+ "rstrip": false,
3768
+ "single_word": false,
3769
+ "special": false
3770
+ },
3771
+ "21594": {
3772
+ "content": "逄",
3773
+ "lstrip": false,
3774
+ "normalized": true,
3775
+ "rstrip": false,
3776
+ "single_word": false,
3777
+ "special": false
3778
+ },
3779
+ "21595": {
3780
+ "content": "龠",
3781
+ "lstrip": false,
3782
+ "normalized": true,
3783
+ "rstrip": false,
3784
+ "single_word": false,
3785
+ "special": false
3786
+ },
3787
+ "21596": {
3788
+ "content": "鮋",
3789
+ "lstrip": false,
3790
+ "normalized": true,
3791
+ "rstrip": false,
3792
+ "single_word": false,
3793
+ "special": false
3794
+ },
3795
+ "21597": {
3796
+ "content": "琚",
3797
+ "lstrip": false,
3798
+ "normalized": true,
3799
+ "rstrip": false,
3800
+ "single_word": false,
3801
+ "special": false
3802
+ },
3803
+ "21598": {
3804
+ "content": "駡",
3805
+ "lstrip": false,
3806
+ "normalized": true,
3807
+ "rstrip": false,
3808
+ "single_word": false,
3809
+ "special": false
3810
+ },
3811
+ "21599": {
3812
+ "content": "靱",
3813
+ "lstrip": false,
3814
+ "normalized": true,
3815
+ "rstrip": false,
3816
+ "single_word": false,
3817
+ "special": false
3818
+ },
3819
+ "21600": {
3820
+ "content": "牘",
3821
+ "lstrip": false,
3822
+ "normalized": true,
3823
+ "rstrip": false,
3824
+ "single_word": false,
3825
+ "special": false
3826
+ },
3827
+ "21601": {
3828
+ "content": "樋",
3829
+ "lstrip": false,
3830
+ "normalized": true,
3831
+ "rstrip": false,
3832
+ "single_word": false,
3833
+ "special": false
3834
+ },
3835
+ "21602": {
3836
+ "content": "鼇",
3837
+ "lstrip": false,
3838
+ "normalized": true,
3839
+ "rstrip": false,
3840
+ "single_word": false,
3841
+ "special": false
3842
+ },
3843
+ "21603": {
3844
+ "content": "慇",
3845
+ "lstrip": false,
3846
+ "normalized": true,
3847
+ "rstrip": false,
3848
+ "single_word": false,
3849
+ "special": false
3850
+ },
3851
+ "21604": {
3852
+ "content": "郃",
3853
+ "lstrip": false,
3854
+ "normalized": true,
3855
+ "rstrip": false,
3856
+ "single_word": false,
3857
+ "special": false
3858
+ },
3859
+ "21605": {
3860
+ "content": "筧",
3861
+ "lstrip": false,
3862
+ "normalized": true,
3863
+ "rstrip": false,
3864
+ "single_word": false,
3865
+ "special": false
3866
+ },
3867
+ "21606": {
3868
+ "content": "鷓",
3869
+ "lstrip": false,
3870
+ "normalized": true,
3871
+ "rstrip": false,
3872
+ "single_word": false,
3873
+ "special": false
3874
+ },
3875
+ "21607": {
3876
+ "content": "鴣",
3877
+ "lstrip": false,
3878
+ "normalized": true,
3879
+ "rstrip": false,
3880
+ "single_word": false,
3881
+ "special": false
3882
+ },
3883
+ "21608": {
3884
+ "content": "鶻",
3885
+ "lstrip": false,
3886
+ "normalized": true,
3887
+ "rstrip": false,
3888
+ "single_word": false,
3889
+ "special": false
3890
+ },
3891
+ "21609": {
3892
+ "content": "猢",
3893
+ "lstrip": false,
3894
+ "normalized": true,
3895
+ "rstrip": false,
3896
+ "single_word": false,
3897
+ "special": false
3898
+ },
3899
+ "21610": {
3900
+ "content": "癦",
3901
+ "lstrip": false,
3902
+ "normalized": true,
3903
+ "rstrip": false,
3904
+ "single_word": false,
3905
+ "special": false
3906
+ },
3907
+ "21611": {
3908
+ "content": "逑",
3909
+ "lstrip": false,
3910
+ "normalized": true,
3911
+ "rstrip": false,
3912
+ "single_word": false,
3913
+ "special": false
3914
+ },
3915
+ "21612": {
3916
+ "content": "囇",
3917
+ "lstrip": false,
3918
+ "normalized": true,
3919
+ "rstrip": false,
3920
+ "single_word": false,
3921
+ "special": false
3922
+ },
3923
+ "21613": {
3924
+ "content": "蘄",
3925
+ "lstrip": false,
3926
+ "normalized": true,
3927
+ "rstrip": false,
3928
+ "single_word": false,
3929
+ "special": false
3930
+ },
3931
+ "21614": {
3932
+ "content": "鋇",
3933
+ "lstrip": false,
3934
+ "normalized": true,
3935
+ "rstrip": false,
3936
+ "single_word": false,
3937
+ "special": false
3938
+ },
3939
+ "21615": {
3940
+ "content": "撣",
3941
+ "lstrip": false,
3942
+ "normalized": true,
3943
+ "rstrip": false,
3944
+ "single_word": false,
3945
+ "special": false
3946
+ },
3947
+ "21616": {
3948
+ "content": "氘",
3949
+ "lstrip": false,
3950
+ "normalized": true,
3951
+ "rstrip": false,
3952
+ "single_word": false,
3953
+ "special": false
3954
+ },
3955
+ "21617": {
3956
+ "content": "駙",
3957
+ "lstrip": false,
3958
+ "normalized": true,
3959
+ "rstrip": false,
3960
+ "single_word": false,
3961
+ "special": false
3962
+ },
3963
+ "21618": {
3964
+ "content": "椗",
3965
+ "lstrip": false,
3966
+ "normalized": true,
3967
+ "rstrip": false,
3968
+ "single_word": false,
3969
+ "special": false
3970
+ },
3971
+ "21619": {
3972
+ "content": "尢",
3973
+ "lstrip": false,
3974
+ "normalized": true,
3975
+ "rstrip": false,
3976
+ "single_word": false,
3977
+ "special": false
3978
+ },
3979
+ "21620": {
3980
+ "content": "㔷",
3981
+ "lstrip": false,
3982
+ "normalized": true,
3983
+ "rstrip": false,
3984
+ "single_word": false,
3985
+ "special": false
3986
+ },
3987
+ "21621": {
3988
+ "content": "炘",
3989
+ "lstrip": false,
3990
+ "normalized": true,
3991
+ "rstrip": false,
3992
+ "single_word": false,
3993
+ "special": false
3994
+ },
3995
+ "21622": {
3996
+ "content": "鴴",
3997
+ "lstrip": false,
3998
+ "normalized": true,
3999
+ "rstrip": false,
4000
+ "single_word": false,
4001
+ "special": false
4002
+ },
4003
+ "21623": {
4004
+ "content": "鯡",
4005
+ "lstrip": false,
4006
+ "normalized": true,
4007
+ "rstrip": false,
4008
+ "single_word": false,
4009
+ "special": false
4010
+ },
4011
+ "21624": {
4012
+ "content": "茘",
4013
+ "lstrip": false,
4014
+ "normalized": true,
4015
+ "rstrip": false,
4016
+ "single_word": false,
4017
+ "special": false
4018
+ },
4019
+ "21625": {
4020
+ "content": "灃",
4021
+ "lstrip": false,
4022
+ "normalized": true,
4023
+ "rstrip": false,
4024
+ "single_word": false,
4025
+ "special": false
4026
+ },
4027
+ "21626": {
4028
+ "content": "湞",
4029
+ "lstrip": false,
4030
+ "normalized": true,
4031
+ "rstrip": false,
4032
+ "single_word": false,
4033
+ "special": false
4034
+ },
4035
+ "21627": {
4036
+ "content": "漖",
4037
+ "lstrip": false,
4038
+ "normalized": true,
4039
+ "rstrip": false,
4040
+ "single_word": false,
4041
+ "special": false
4042
+ }
4043
+ },
4044
+ "clean_up_tokenization_spaces": true,
4045
+ "cls_token": "[CLS]",
4046
+ "do_lower_case": false,
4047
+ "mask_token": "[MASK]",
4048
+ "max_length": 512,
4049
+ "model_max_length": 512,
4050
+ "pad_to_multiple_of": null,
4051
+ "pad_token": "[PAD]",
4052
+ "pad_token_type_id": 0,
4053
+ "padding_side": "right",
4054
+ "sep_token": "[SEP]",
4055
+ "stride": 0,
4056
+ "strip_accents": null,
4057
+ "tokenize_chinese_chars": true,
4058
+ "tokenizer_class": "BertTokenizer",
4059
+ "truncation_side": "right",
4060
+ "truncation_strategy": "longest_first",
4061
+ "unk_token": "[UNK]"
4062
+ }
bert/bert_models.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bert-large-cantonese": {
3
+ "repo_id": "hon9kon9ize/bert-large-cantonese",
4
+ "files": [
5
+ "pytorch_model.bin"
6
+ ]
7
+ },
8
+ "deberta-v3-large": {
9
+ "repo_id": "microsoft/deberta-v3-large",
10
+ "files": [
11
+ "spm.model",
12
+ "pytorch_model.bin"
13
+ ]
14
+ }
15
+ }
bert/deberta-v3-large/.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bert/deberta-v3-large/README.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ tags:
4
+ - deberta
5
+ - deberta-v3
6
+ - fill-mask
7
+ thumbnail: https://huggingface.co/front/thumbnails/microsoft.png
8
+ license: mit
9
+ ---
10
+
11
+ ## DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing
12
+
13
+ [DeBERTa](https://arxiv.org/abs/2006.03654) improves the BERT and RoBERTa models using disentangled attention and an enhanced mask decoder. With those two improvements, DeBERTa outperforms RoBERTa on a majority of NLU tasks with 80GB training data.
14
+
15
+ In [DeBERTa V3](https://arxiv.org/abs/2111.09543), we further improved the efficiency of DeBERTa using ELECTRA-Style pre-training with Gradient Disentangled Embedding Sharing. Compared to DeBERTa, our V3 version significantly improves the model performance on downstream tasks. You can find more technical details about the new model in our [paper](https://arxiv.org/abs/2111.09543).
16
+
17
+ Please check the [official repository](https://github.com/microsoft/DeBERTa) for more implementation details and updates.
18
+
19
+ The DeBERTa V3 large model comes with 24 layers and a hidden size of 1024. It has 304M backbone parameters with a vocabulary containing 128K tokens which introduces 131M parameters in the Embedding layer. This model was trained using the same 160GB data as DeBERTa V2.
20
+
21
+
22
+ #### Fine-tuning on NLU tasks
23
+
24
+ We present the dev results on SQuAD 2.0 and MNLI tasks.
25
+
26
+ | Model |Vocabulary(K)|Backbone #Params(M)| SQuAD 2.0(F1/EM) | MNLI-m/mm(ACC)|
27
+ |-------------------|----------|-------------------|-----------|----------|
28
+ | RoBERTa-large |50 |304 | 89.4/86.5 | 90.2 |
29
+ | XLNet-large |32 |- | 90.6/87.9 | 90.8 |
30
+ | DeBERTa-large |50 |- | 90.7/88.0 | 91.3 |
31
+ | **DeBERTa-v3-large**|128|304 | **91.5/89.0**| **91.8/91.9**|
32
+
33
+
34
+ #### Fine-tuning with HF transformers
35
+
36
+ ```bash
37
+ #!/bin/bash
38
+
39
+ cd transformers/examples/pytorch/text-classification/
40
+
41
+ pip install datasets
42
+ export TASK_NAME=mnli
43
+
44
+ output_dir="ds_results"
45
+
46
+ num_gpus=8
47
+
48
+ batch_size=8
49
+
50
+ python -m torch.distributed.launch --nproc_per_node=${num_gpus} \
51
+ run_glue.py \
52
+ --model_name_or_path microsoft/deberta-v3-large \
53
+ --task_name $TASK_NAME \
54
+ --do_train \
55
+ --do_eval \
56
+ --evaluation_strategy steps \
57
+ --max_seq_length 256 \
58
+ --warmup_steps 50 \
59
+ --per_device_train_batch_size ${batch_size} \
60
+ --learning_rate 6e-6 \
61
+ --num_train_epochs 2 \
62
+ --output_dir $output_dir \
63
+ --overwrite_output_dir \
64
+ --logging_steps 1000 \
65
+ --logging_dir $output_dir
66
+
67
+ ```
68
+
69
+ ### Citation
70
+
71
+ If you find DeBERTa useful for your work, please cite the following papers:
72
+
73
+ ``` latex
74
+ @misc{he2021debertav3,
75
+ title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
76
+ author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
77
+ year={2021},
78
+ eprint={2111.09543},
79
+ archivePrefix={arXiv},
80
+ primaryClass={cs.CL}
81
+ }
82
+ ```
83
+
84
+ ``` latex
85
+ @inproceedings{
86
+ he2021deberta,
87
+ title={DEBERTA: DECODING-ENHANCED BERT WITH DISENTANGLED ATTENTION},
88
+ author={Pengcheng He and Xiaodong Liu and Jianfeng Gao and Weizhu Chen},
89
+ booktitle={International Conference on Learning Representations},
90
+ year={2021},
91
+ url={https://openreview.net/forum?id=XPZIaotutsD}
92
+ }
93
+ ```
bert/deberta-v3-large/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "deberta-v2",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 1024,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 4096,
9
+ "max_position_embeddings": 512,
10
+ "relative_attention": true,
11
+ "position_buckets": 256,
12
+ "norm_rel_ebd": "layer_norm",
13
+ "share_att_key": true,
14
+ "pos_att_type": "p2c|c2p",
15
+ "layer_norm_eps": 1e-7,
16
+ "max_relative_positions": -1,
17
+ "position_biased_input": false,
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "type_vocab_size": 0,
21
+ "vocab_size": 128100
22
+ }
bert/deberta-v3-large/generator_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "deberta-v2",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 1024,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 4096,
9
+ "max_position_embeddings": 512,
10
+ "relative_attention": true,
11
+ "position_buckets": 256,
12
+ "norm_rel_ebd": "layer_norm",
13
+ "share_att_key": true,
14
+ "pos_att_type": "p2c|c2p",
15
+ "layer_norm_eps": 1e-7,
16
+ "max_relative_positions": -1,
17
+ "position_biased_input": false,
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 12,
20
+ "type_vocab_size": 0,
21
+ "vocab_size": 128100
22
+ }
bert/deberta-v3-large/spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
+ size 2464616
bert/deberta-v3-large/tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "do_lower_case": false,
3
+ "vocab_type": "spm"
4
+ }
infer.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import onnxruntime as ort
3
+ from text import cantonese, english, cleaned_text_to_sequence
4
+
5
+ language_module_map = {"EN": english, "YUE": cantonese}
6
+
7
def clean_text(text, language):
    """Normalize ``text`` and phonemize it with the frontend for ``language``.

    Args:
        text: Raw input text.
        language: Key into ``language_module_map`` selecting the frontend
            module.

    Returns:
        A 4-tuple ``(norm_text, phones, tones, word2ph)`` where ``norm_text``
        is the output of the module's ``text_normalize`` and the remaining
        three values are what its ``g2p`` returns for that normalized text.

    Raises:
        KeyError: If ``language`` is not a key of ``language_module_map``.
    """
    frontend = language_module_map[language]
    normalized = frontend.text_normalize(text)
    # g2p is unpacked into exactly three values before being re-packed.
    phones, tones, word2ph = frontend.g2p(normalized)
    return normalized, phones, tones, word2ph
12
+
13
+
14
def convert_pad_shape(pad_shape):
    """Flatten a nested pad specification, outermost entry last.

    The sequence of per-dimension entries is reversed and then concatenated
    into a single flat list.

    Args:
        pad_shape: Sliceable sequence of iterables, e.g. ``[[0, 0], [1, 0]]``.

    Returns:
        A flat list with the items of the last entry first.
    """
    flattened = []
    for entry in pad_shape[::-1]:
        flattened.extend(entry)
    return flattened
18
+
19
+
20
def sequence_mask(length, max_length=None):
    """Build a boolean mask that is True for positions below each length.

    Args:
        length: NumPy array of lengths; its dtype is reused for the position
            index.
        max_length: Number of positions per row. Defaults to ``length.max()``.

    Returns:
        Boolean array comparing positions ``0..max_length-1`` (broadcast along
        a new leading axis) against ``length`` (expanded with a new axis 1).
    """
    width = length.max() if max_length is None else max_length
    positions = np.arange(width, dtype=length.dtype)[np.newaxis, :]
    limits = np.expand_dims(length, 1)
    return positions < limits
25
+
26
+
27
def generate_path(duration, mask):
    """Expand per-token durations into a boolean token-to-frame path.

    Each token is marked on the frames between the cumulative duration of the
    tokens before it and its own cumulative duration.

    Args:
        duration: [b, 1, t_x]
        mask: [b, 1, t_y, t_x]; only its shape is read.

    Returns:
        Boolean array of shape [b, 1, t_y, t_x].
    """
    batch, _, n_frames, n_tokens = mask.shape

    # End frame (exclusive) of every token, flattened over batch and token.
    end_frames = np.cumsum(duration, -1).reshape(batch * n_tokens)
    filled = sequence_mask(end_frames, n_frames).reshape(batch, n_tokens, n_frames)

    # Shift the masks down by one token (zero-padded first row) and XOR, which
    # leaves only the frames that belong to each individual token.
    previous = np.pad(filled, ((0, 0), (1, 0), (0, 0)))[:, :-1]
    per_token = filled ^ previous

    return np.expand_dims(per_token, 1).transpose(0, 1, 3, 2)
42
+
43
+
44
class OnnxInferenceSession:
    """Chain the six exported ONNX sub-models of the synthesis pipeline.

    The sessions are stored as ``enc``, ``emb_g``, ``dp``, ``sdp``, ``flow``
    and ``dec``. Calling the instance runs them in order and returns the first
    output of ``dec``.
    """

    def __init__(self, path, Providers=["CPUExecutionProvider"]):
        """Create one ``onnxruntime`` inference session per sub-model.

        Args:
            path: Mapping with the keys ``"enc"``, ``"emb_g"``, ``"dp"``,
                ``"sdp"``, ``"flow"`` and ``"dec"``; each value is handed to
                ``ort.InferenceSession``.
            Providers: Execution providers forwarded to every session. The
                default list is only read here, never mutated.
        """
        self.enc = ort.InferenceSession(path["enc"], providers=Providers)
        self.emb_g = ort.InferenceSession(path["emb_g"], providers=Providers)
        self.dp = ort.InferenceSession(path["dp"], providers=Providers)
        self.sdp = ort.InferenceSession(path["sdp"], providers=Providers)
        self.flow = ort.InferenceSession(path["flow"], providers=Providers)
        self.dec = ort.InferenceSession(path["dec"], providers=Providers)

    def __call__(
        self,
        seq,
        tone,
        language,
        bert_en,
        bert_yue,
        sid,
        seed=114514,
        seq_noise_scale=0.8,
        sdp_noise_scale=0.6,
        length_scale=1.0,
        sdp_ratio=0.0,
    ):
        """Run the full pipeline for one batch.

        Args:
            seq: Symbol ID array; a 1-D array is promoted to a batch of one.
            tone: Tone ID array, promoted like ``seq``.
            language: Language ID array, promoted like ``seq``.
            bert_en: Array fed to the encoder input ``"bert_0"`` as float32.
            bert_yue: Array fed to the encoder input ``"bert_1"`` as float32.
            sid: Speaker ID array fed to ``emb_g`` as int64.
            seed: Seed applied to NumPy's *global* RNG before noise is drawn.
            seq_noise_scale: Multiplier on the noise added to the expanded
                prior mean.
            sdp_noise_scale: Multiplier on the noise fed to ``sdp``.
            length_scale: Multiplier on the predicted durations.
            sdp_ratio: Blend weight; the ``sdp`` output is weighted by
                ``sdp_ratio`` and the ``dp`` output by ``1 - sdp_ratio``.

        Returns:
            The first output of the ``dec`` session (presumably the waveform;
            confirm against the exported model).

        Raises:
            ValueError: If ``seq``, ``tone`` or ``language`` is not 2-D after
                the 1-D promotion.
        """
        if seq.ndim == 1:
            seq = np.expand_dims(seq, 0)
        if tone.ndim == 1:
            tone = np.expand_dims(tone, 0)
        if language.ndim == 1:
            language = np.expand_dims(language, 0)
        # The previous `assert (a, b, c)` tested a non-empty tuple, which is
        # always truthy, so the shapes were never actually validated.
        if not (seq.ndim == 2 and tone.ndim == 2 and language.ndim == 2):
            raise ValueError(
                "seq, tone and language must be 1-D or 2-D arrays, got ndim "
                f"{seq.ndim}, {tone.ndim} and {language.ndim}"
            )

        g = self.emb_g.run(
            None,
            {
                "sid": sid.astype(np.int64),
            },
        )[0]
        g = np.expand_dims(g, -1)

        enc_rtn = self.enc.run(
            None,
            {
                "x": seq.astype(np.int64),
                "t": tone.astype(np.int64),
                "language": language.astype(np.int64),
                "bert_0": bert_en.astype(np.float32),
                "bert_1": bert_yue.astype(np.float32),
                "g": g.astype(np.float32),
            },
        )
        x, m_p, logs_p, x_mask = enc_rtn[0], enc_rtn[1], enc_rtn[2], enc_rtn[3]

        # Seeding the global RNG makes both randn draws below reproducible,
        # at the cost of resetting NumPy's global random state for the caller.
        np.random.seed(seed)
        zinput = np.random.randn(x.shape[0], 2, x.shape[2]) * sdp_noise_scale

        # Blend the two duration predictors (sdp runs first, then dp).
        logw_sdp = self.sdp.run(
            None,
            {"x": x, "x_mask": x_mask, "zin": zinput.astype(np.float32), "g": g},
        )[0]
        logw_dp = self.dp.run(None, {"x": x, "x_mask": x_mask, "g": g})[0]
        logw = logw_sdp * (sdp_ratio) + logw_dp * (1 - sdp_ratio)

        w = np.exp(logw) * x_mask * length_scale
        w_ceil = np.ceil(w)
        # At least one output frame per batch item, even if all durations are 0.
        y_lengths = np.clip(np.sum(w_ceil, (1, 2)), a_min=1.0, a_max=100000).astype(
            np.int64
        )
        y_mask = np.expand_dims(sequence_mask(y_lengths, None), 1)
        attn_mask = np.expand_dims(x_mask, 2) * np.expand_dims(y_mask, -1)
        attn = generate_path(w_ceil, attn_mask)
        m_p = np.matmul(attn.squeeze(1), m_p.transpose(0, 2, 1)).transpose(
            0, 2, 1
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = np.matmul(attn.squeeze(1), logs_p.transpose(0, 2, 1)).transpose(
            0, 2, 1
        )  # [b, t', t], [b, t, d] -> [b, d, t']

        z_p = (
            m_p
            + np.random.randn(m_p.shape[0], m_p.shape[1], m_p.shape[2])
            * np.exp(logs_p)
            * seq_noise_scale
        )

        z = self.flow.run(
            None,
            {
                "z_p": z_p.astype(np.float32),
                "y_mask": y_mask.astype(np.float32),
                "g": g,
            },
        )[0]

        return self.dec.run(None, {"z_in": z.astype(np.float32), "g": g})[0]
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==4.39.0
2
+ pycantonese==3.4.0
3
+ cn2an==0.5.22
4
+ jieba==0.42.1
5
+ transformers[onnx]==4.42.4
6
+ torch==2.3.1
text/__init__.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from text.symbols import *
2
+
3
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4
+
5
+
6
def cleaned_text_to_sequence(cleaned_text, tones, language):
    """Map cleaned phoneme symbols, tones and a language tag to integer IDs.

    Args:
        cleaned_text: iterable of phoneme symbols; each must be a key of
            ``_symbol_to_id`` (``KeyError`` otherwise).
        tones: iterable of per-phoneme tone numbers, local to ``language``.
        language: key of ``language_tone_start_map`` and ``language_id_map``.

    Returns:
        Tuple ``(phones, tones, lang_ids)`` of three lists: the symbol IDs,
        the tones shifted by the language's tone offset, and the language ID
        repeated once per phoneme.
    """
    phone_ids = [_symbol_to_id[sym] for sym in cleaned_text]
    offset = language_tone_start_map[language]
    shifted_tones = [tone + offset for tone in tones]
    language_ids = [language_id_map[language]] * len(phone_ids)
    return phone_ids, shifted_tones, language_ids
19
+
20
+
21
def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
    """Dispatch to the BERT feature extractor registered for ``language``.

    Args:
        norm_text: normalized text to embed.
        word2ph: per-token phoneme counts, forwarded to the extractor.
        language: ``"EN"`` or ``"YUE"``; any other value raises ``KeyError``.
        device: device string forwarded to the extractor.
        style_text: optional style reference text, forwarded unchanged.
        style_weight: blend weight for the style text, forwarded unchanged.

    Returns:
        Whatever the selected ``get_bert_feature`` returns.
    """
    # Imported lazily: both modules load a tokenizer at import time.
    from .english_bert_mock import get_bert_feature as en_bert
    from .cantonese_bert import get_bert_feature as yue_bert

    extractors = {"EN": en_bert, "YUE": yue_bert}
    extract = extractors[language]
    return extract(norm_text, word2ph, device, style_text, style_weight)
30
+
text/cantonese.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from text.symbols import punctuation
2
+ import re
3
+ import unicodedata
4
+ import cn2an
5
+ import pycantonese
6
+ import jieba
7
+ import csv
8
+
9
+
10
+ jieba.load_userdict("./text/yue_dict.txt")
11
+
12
# word -> list of Jyutping readings in file order; get_jyutping() uses the
# first reading of each word.
jyutping_dict = {}

with open("./text/jyutping.csv", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            # Each non-blank row must be exactly "word,jyutping".
            word, jyutping = line.split(",")
            jyutping_dict.setdefault(word, []).append(jyutping)
25
+
26
+
27
def normalizer(x):
    """Return ``x`` passed through ``cn2an.transform`` in ``"an2cn"`` mode."""
    return cn2an.transform(x, "an2cn")
31
+
32
+
33
def word2jyutping(word):
    """Return the space-separated Jyutping of the CJK ideographs in ``word``.

    Characters whose Unicode name does not start with
    ``CJK UNIFIED IDEOGRAPH`` are skipped. Readings ``la<tone>`` and
    ``ga<tone>`` are rewritten to ``laa<tone>`` and ``gaa<tone>``.

    Args:
        word: string to convert, looked up character by character.

    Returns:
        The readings joined by single spaces; ``""`` if no character qualifies.

    Raises:
        ValueError: if pycantonese yields ``None`` for a character.
    """
    jyutpings = [
        pycantonese.characters_to_jyutping(w)[0][1]
        for w in word
        if unicodedata.name(w, "").startswith("CJK UNIFIED IDEOGRAPH")
    ]

    # Check for missing readings before touching the strings. Previously the
    # regex ran first, so a None reading surfaced as a TypeError from
    # re.search rather than this ValueError.
    if None in jyutpings:
        raise ValueError(f"Failed to convert {word} to jyutping: {jyutpings}")

    for i, j in enumerate(jyutpings):
        if re.search(r"^(la|ga)[1-6]$", j):
            # la1 -> laa1, ga1 -> gaa1
            jyutpings[i] = j.replace("a", "aa")

    return " ".join(jyutpings)
46
+
47
+
48
+ INITIALS = ["", "b", "c", "d", "f", "g", "gw", "h", "j",
49
+ "k", "kw", "l", "m", "n", "ng", "p", "s", "t", "w", "z"]
50
+ FINALS = ["aa", "aai", "aau", "aam", "aan", "aang", "aap", "aat", "aak", "ai", "au", "am", "an", "ang", "ap", "at", "ak", "e", "ei", "eu", "em", "eng", "ep", "ek", "i", "iu", "im",
51
+ "in", "ing", "ip", "it", "ik", "o", "oi", "ou", "on", "ong", "ot", "ok", "oe", "oeng", "oek", "eoi", "eon", "eot", "u", "ui", "un", "ung", "ut", "uk", "yu", "yun", "yut", "m", "ng"]
52
+
53
+ rep_map = {
54
+ ":": ",",
55
+ "︰": ",",
56
+ ";": ",",
57
+ ",": ",",
58
+ "﹐": ",",
59
+ "。": ".",
60
+ "!": "!",
61
+ "?": "?",
62
+ "﹖": "?",
63
+ "﹗": "!",
64
+ "\n": ".",
65
+ "·": ",",
66
+ "、": ",",
67
+ "丶": ",",
68
+ "...": "…",
69
+ "⋯": "…",
70
+ "$": ".",
71
+ "“": "'",
72
+ "”": "'",
73
+ '"': "'",
74
+ "‘": "'",
75
+ "’": "'",
76
+ "(": "'",
77
+ ")": "'",
78
+ "(": "'",
79
+ ")": "'",
80
+ "《": "'",
81
+ "》": "'",
82
+ "【": "'",
83
+ "】": "'",
84
+ "[": "'",
85
+ "]": "'",
86
+ "—": "-",
87
+ "~": "-",
88
+ "~": "-",
89
+ "「": "'",
90
+ "」": "'",
91
+ "_": "-",
92
+ }
93
+
94
+ replacement_chars = {
95
+ "\n": " ",
96
+ 'ㄧ': '一',
97
+ '—': '一',
98
+ '更': '更',
99
+ '不': '不',
100
+ '料': '料',
101
+ '聯': '聯',
102
+ '行': '行',
103
+ '利': '利',
104
+ '謢': '護',
105
+ '岀': '出',
106
+ '鎭': '鎮',
107
+ '戯': '戲',
108
+ '旣': '既',
109
+ '立': '立',
110
+ '來': '來',
111
+ '年': '年',
112
+ '㗇': '蝦',
113
+ }
114
+
115
+
116
def replace_punctuation(text):
    """Map punctuation through ``rep_map`` and drop every other character.

    After substitution only CJK unified ideographs and members of
    ``punctuation`` are kept.
    """
    alternation = "|".join(re.escape(src) for src in rep_map)
    mapped = re.sub(alternation, lambda hit: rep_map[hit.group()], text)

    kept = []
    for ch in mapped:
        is_ideograph = unicodedata.name(ch, "").startswith("CJK UNIFIED IDEOGRAPH")
        if is_ideograph or ch in punctuation:
            kept.append(ch)

    return "".join(kept)
124
+
125
+
126
def replace_chars(text):
    """Apply every ``replacement_chars`` substitution to ``text``, in dict order."""
    for variant in replacement_chars:
        text = text.replace(variant, replacement_chars[variant])
    return text
130
+
131
+
132
def word_segmentation(text):
    """Segment ``text`` with ``jieba.cut`` and return its result unchanged."""
    return jieba.cut(text)
135
+
136
+
137
def text_normalize(text):
    """Normalize raw Cantonese text for grapheme-to-phoneme conversion.

    Steps, in order: strip surrounding whitespace, run ``normalizer``,
    run ``replace_punctuation``, then run ``replace_chars``.
    """
    # NOTE(review): replace_punctuation already drops everything that is not
    # a CJK unified ideograph or punctuation, so replace_chars entries keyed
    # on other characters (e.g. "\n") can no longer match here -- confirm the
    # order is intentional.
    stripped = text.strip()
    with_numbers_converted = normalizer(stripped)
    return replace_chars(replace_punctuation(with_numbers_converted))
143
+
144
+
145
def jyuping_to_initials_finals_tones(jyuping_syllables):
    """Expand Jyutping syllables into phoneme, tone and word2ph lists.

    A punctuation mark becomes one phoneme with tone 0. Any other syllable is
    split by ``parse_jyutping`` into an initial and a final, both carrying the
    syllable's tone.

    Returns:
        Tuple ``(initials_finals, tones, word2ph)``; ``word2ph`` holds 1 per
        punctuation mark and 2 per syllable.
    """
    initials_finals, tones, word2ph = [], [], []

    for syllable in jyuping_syllables:
        if syllable not in punctuation:
            init, final, tone = parse_jyutping(syllable)
            initials_finals += [init, final]
            tones += [tone, tone]
            word2ph.append(2)
            continue
        initials_finals.append(syllable)
        tones.append(0)
        word2ph.append(1)  # punctuation occupies a single phoneme slot

    assert len(initials_finals) == len(tones)
    return initials_finals, tones, word2ph
163
+
164
+
165
+ wordshk_juytping = {}
166
+
167
+ # with open("/notebooks/bert-vits2/Bert-VITS2-Cantonese/wordshk_juytping.csv", "r") as csv_file:
168
+ # csv_reader = csv.reader(csv_file, delimiter=',')
169
+
170
+ # for row in csv_reader:
171
+ # wordshk_juytping[text_normalize(row[0])] = row[1]
172
+
173
+
174
def get_jyutping(text):
    """Convert normalized Cantonese text into a flat list of Jyutping tokens.

    A whole-text hit in ``wordshk_juytping`` is returned directly. Otherwise
    the text is segmented; punctuation passes through unchanged, and every
    other word takes its first reading from ``jyutping_dict``, falling back
    to ``word2jyutping``.

    Args:
        text: normalized text.

    Returns:
        List of Jyutping syllables and punctuation marks, in text order.

    Raises:
        ValueError: if a word's reading is not one or more
            ``<letters><tone digits>`` syllables separated by single spaces.
    """
    if text in wordshk_juytping:
        return wordshk_juytping[text].split(" ")

    # Matches multiple syllables (e.g. "liu4 ge3") or a single one ("liu4").
    # Compiled once per call instead of once per word.
    valid_jyutping = re.compile(r"^([a-z]+[1-6]+[ ]?)+$")
    jyutping_array = []

    for word in word_segmentation(text):
        if word in punctuation:
            jyutping_array.append(word)
            continue

        if word in jyutping_dict:
            jyutpings = jyutping_dict[word][0]
        else:
            jyutpings = word2jyutping(word)

        # A leftover debug print for readings containing "la1" was removed
        # here; it wrote a generator repr to stdout on every hit.
        if not valid_jyutping.search(jyutpings):
            raise ValueError(
                f"Failed to convert {word} to jyutping: {jyutpings}")

        jyutping_array.extend(jyutpings.split(" "))

    return jyutping_array
203
+
204
+
205
def get_bert_feature(text, word2ph):
    """Forward to ``text.cantonese_bert.get_bert_feature`` with its default options."""
    # Imported lazily: cantonese_bert loads a tokenizer at import time.
    from text import cantonese_bert

    extract = cantonese_bert.get_bert_feature
    return extract(text, word2ph)
209
+
210
+
211
def parse_jyutping(jyutping):
    """Split one Jyutping syllable into ``[initial, final, tone]``.

    Args:
        jyutping: a syllable such as ``"gwong2"``, ``"ng5"`` or ``"m4"``.

    Returns:
        A list ``[init, final, tone]``; ``init`` is ``""`` for syllabic
        ``m``/``ng`` and for syllables starting with a vowel, and ``tone`` is
        an ``int``.

    Raises:
        ValueError: if the string is shorter than two characters, does not end
            with a tone digit, or has an initial or final that is not listed
            in ``INITIALS`` / ``FINALS``.
    """
    orig_jyutping = jyutping

    if len(jyutping) < 2:
        raise ValueError(f"Jyutping string too short: {jyutping}")

    init = ""
    if jyutping.startswith("ng") and len(jyutping) == 3:
        pass  # syllabic "ng" + tone: no initial, "ng" is the final
    elif jyutping[0] == "m" and len(jyutping) == 2:
        pass  # syllabic "m" + tone: no initial, "m" is the final
    elif jyutping[:2] in ("ng", "gw", "kw"):
        # Two-letter initials must be tested before the single letters below.
        init = jyutping[:2]
        jyutping = jyutping[2:]
    elif jyutping[0] in "bpmfdtnlgkhwzcsj":
        init = jyutping[0]
        jyutping = jyutping[1:]

    try:
        tone = int(jyutping[-1])
    except (ValueError, IndexError):
        # IndexError: nothing is left after the initial (e.g. "gw").
        # Both cases were previously hidden behind a bare ``except:``.
        raise ValueError(
            f"Jyutping string does not end with a tone number, in {orig_jyutping}"
        ) from None
    final = jyutping[:-1]

    # Was an ``assert``, which is stripped under ``python -O``. Every branch
    # above yields a member of INITIALS, so this is a defensive check only.
    if init not in INITIALS:
        raise ValueError(f"Invalid initial: {init}, in {orig_jyutping}")

    if final not in FINALS:
        raise ValueError(f"Invalid final: {final}, in {orig_jyutping}")

    return [init, final, tone]
249
+
250
+
251
def g2p(text):
    """Convert normalized Cantonese text to phonemes, tones and word2ph.

    Each of the three returned lists is wrapped with one leading and one
    trailing pad entry: ``"_"`` for phonemes, ``0`` for tones and ``1`` for
    word2ph.

    Returns:
        Tuple ``(phones, tones, word2ph)`` of lists.
    """
    syllables = get_jyutping(text)
    phones, tones, word2ph = jyuping_to_initials_finals_tones(syllables)
    return ["_", *phones, "_"], [0, *tones, 0], [1, *word2ph, 1]
259
+
260
+
261
+ if __name__ == "__main__":
262
+ from text.cantonese_bert import get_bert_feature
263
+
264
+ # text = "Apple BB 你點解會咁柒㗎?我真係唔該晒你呀!123"
265
+ text = "佢邊係想辭工吖,跳下草裙舞想加人工之嘛。"
266
+ # text = "我個 app 嘅介紹文想由你寫,因為我唔知從一般用家角度要細緻到乜程度"
267
+ # text = "佢哋最叻咪就係去㗇人傷害人,得個殼咋!"
268
+ text = text_normalize(text)
269
+ print('normalized text', text)
270
+ phones, tones, word2ph = g2p(text)
271
+ print(phones, tones, word2ph)
272
+ bert = get_bert_feature(text, word2ph)
273
+ print(bert.shape)
text/cantonese_bert.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
5
+
6
+ LOCAL_PATH = "./bert/bert-large-cantonese"
7
+
8
+ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
9
+
10
+ models = dict()
11
+
12
+
13
def get_bert_feature(
    text,
    word2ph,
    device="cpu",
    style_text=None,
    style_weight=0.7,
):
    """Compute phoneme-level BERT features for Cantonese ``text``.

    The third-from-last hidden state of the masked-LM model is taken per
    token, and row ``i`` is repeated ``word2ph[i]`` times. When ``style_text``
    is given, each row is blended with the mean hidden state of the style
    text using ``style_weight``.

    Args:
        text: normalized text; ``len(word2ph)`` must equal ``len(text) + 2``.
        word2ph: number of phonemes per token, including both pad slots.
        device: device string. ``"cpu"`` is upgraded to ``"mps"`` on macOS
            when MPS is available; a falsy value selects ``"cuda"``.
        style_text: optional style reference text.
        style_weight: weight of the style mean in the blend.

    Returns:
        The concatenated rows, transposed -- presumably
        ``(hidden_dim, sum(word2ph))``; TODO confirm.
    """
    on_macos_with_mps = sys.platform == "darwin" and torch.backends.mps.is_available()
    if on_macos_with_mps and device == "cpu":
        device = "mps"
    if not device:
        device = "cuda"

    # One model instance is cached per device.
    if device not in models:
        models[device] = AutoModelForMaskedLM.from_pretrained(
            LOCAL_PATH).to(device)
    model = models[device]

    def _third_last_hidden(sentence):
        # Tokenize, move tensors to the device, return hidden_states[-3][0] on CPU.
        encoded = tokenizer(sentence, return_tensors="pt")
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        output = model(**encoded, output_hidden_states=True)
        return torch.cat(output["hidden_states"][-3:-2], -1)[0].cpu()

    with torch.no_grad():
        res = _third_last_hidden(text)
        if style_text:
            style_res_mean = _third_last_hidden(style_text).mean(0)

    assert len(word2ph) == len(text) + \
        2, f"{len(word2ph)} != {len(text) + 2}, {word2ph}, {text}"

    rows = []
    for idx, repeats in enumerate(word2ph):
        tiled = res[idx].repeat(repeats, 1)
        if style_text:
            tiled = (
                tiled * (1 - style_weight)
                + style_res_mean.repeat(repeats, 1) * style_weight
            )
        rows.append(tiled)

    return torch.cat(rows, dim=0).T
63
+
64
+
65
if __name__ == "__main__":
    # Stand-alone demo of the row-repetition step in get_bert_feature, using
    # random features instead of a real model.
    # 38 token rows, each a 1024-dimensional feature (one row per word2phone entry).
    word_level_feature = torch.rand(38, 1024)
    word2phone = [
        1,
        2,
        1,
        2,
        2,
        1,
        2,
        2,
        1,
        2,
        2,
        1,
        2,
        2,
        2,
        2,
        2,
        1,
        1,
        2,
        2,
        1,
        2,
        2,
        2,
        2,
        1,
        2,
        2,
        2,
        2,
        2,
        1,
        2,
        2,
        2,
        2,
        1,
    ]

    # Total number of frames (computed for reference; not used below).
    total_frames = sum(word2phone)
    print(word_level_feature.shape)
    print(word2phone)
    phone_level_feature = []
    for i in range(len(word2phone)):
        print(word_level_feature[i].shape)

        # Repeat each token's feature row word2phone[i] times.
        repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)
    print(phone_level_feature.shape)  # (sum(word2phone), 1024)
text/cleaner.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from text import cantonese, english, cleaned_text_to_sequence
2
+
3
+
4
+ language_module_map = {"EN": english, "YUE": cantonese}
5
+
6
+
7
def clean_text(text, language):
    """Normalize ``text`` and run G2P with the module registered for ``language``.

    Returns:
        Tuple ``(norm_text, phones, tones, word2ph)``.

    Raises:
        KeyError: if ``language`` is not a key of ``language_module_map``.
    """
    module = language_module_map[language]
    normalized = module.text_normalize(text)
    phones, tones, word2ph = module.g2p(normalized)
    return normalized, phones, tones, word2ph
12
+
13
+
14
def clean_text_bert(text, language):
    """Normalize ``text``, run G2P and compute BERT features for ``language``.

    Returns:
        Tuple ``(phones, tones, bert)``.

    Raises:
        KeyError: if ``language`` is not a key of ``language_module_map``.
    """
    module = language_module_map[language]
    normalized = module.text_normalize(text)
    phones, tones, word2ph = module.g2p(normalized)
    features = module.get_bert_feature(normalized, word2ph)
    return phones, tones, features
20
+
21
+
22
def text_to_sequence(text, language):
    """Clean ``text`` and convert the resulting phonemes and tones to ID sequences.

    Returns:
        The ``(phones, tones, lang_ids)`` tuple from ``cleaned_text_to_sequence``.
    """
    # The normalized text and word2ph are not needed here.
    _, phones, tones, _ = clean_text(text, language)
    return cleaned_text_to_sequence(phones, tones, language)
25
+
26
+
27
+ if __name__ == "__main__":
28
+ pass
text/english.py ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import os
3
+ import re
4
+ from g2p_en import G2p
5
+ from transformers import DebertaV2Tokenizer
6
+
7
+ from text import symbols
8
+ from text.symbols import punctuation
9
+
10
+ current_file_path = os.path.dirname(__file__)
11
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
12
+ CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
13
+ _g2p = G2p()
14
+ LOCAL_PATH = "./bert/deberta-v3-large"
15
+ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
16
+
17
+ arpa = {
18
+ "AH0",
19
+ "S",
20
+ "AH1",
21
+ "EY2",
22
+ "AE2",
23
+ "EH0",
24
+ "OW2",
25
+ "UH0",
26
+ "NG",
27
+ "B",
28
+ "G",
29
+ "AY0",
30
+ "M",
31
+ "AA0",
32
+ "F",
33
+ "AO0",
34
+ "ER2",
35
+ "UH1",
36
+ "IY1",
37
+ "AH2",
38
+ "DH",
39
+ "IY0",
40
+ "EY1",
41
+ "IH0",
42
+ "K",
43
+ "N",
44
+ "W",
45
+ "IY2",
46
+ "T",
47
+ "AA1",
48
+ "ER1",
49
+ "EH2",
50
+ "OY0",
51
+ "UH2",
52
+ "UW1",
53
+ "Z",
54
+ "AW2",
55
+ "AW1",
56
+ "V",
57
+ "UW2",
58
+ "AA2",
59
+ "ER",
60
+ "AW0",
61
+ "UW0",
62
+ "R",
63
+ "OW1",
64
+ "EH1",
65
+ "ZH",
66
+ "AE0",
67
+ "IH2",
68
+ "IH",
69
+ "Y",
70
+ "JH",
71
+ "P",
72
+ "AY1",
73
+ "EY0",
74
+ "OY2",
75
+ "TH",
76
+ "HH",
77
+ "D",
78
+ "ER0",
79
+ "CH",
80
+ "AO1",
81
+ "AE1",
82
+ "AO2",
83
+ "OY1",
84
+ "AY2",
85
+ "IH1",
86
+ "OW0",
87
+ "L",
88
+ "SH",
89
+ }
90
+
91
+
92
def post_replace_ph(ph):
    """Map one phoneme or punctuation mark onto the model's symbol set.

    The token is first translated through a small replacement table
    (full-width punctuation, ellipsis variants, and ``"v"`` -> ``"V"``).
    Anything still not in ``symbols`` becomes ``"UNK"``.
    """
    rep_map = {
        ":": ",",
        ";": ",",
        ",": ",",
        "。": ".",
        "!": "!",
        "?": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "…": "...",
        "···": "...",
        "・・・": "...",
        "v": "V",
    }
    ph = rep_map.get(ph, ph)
    return ph if ph in symbols else "UNK"
115
+
116
+
117
+ rep_map = {
118
+ ":": ",",
119
+ ";": ",",
120
+ ",": ",",
121
+ "。": ".",
122
+ "!": "!",
123
+ "?": "?",
124
+ "\n": ".",
125
+ ".": ".",
126
+ "…": "...",
127
+ "···": "...",
128
+ "・・・": "...",
129
+ "·": ",",
130
+ "・": ",",
131
+ "、": ",",
132
+ "$": ".",
133
+ "“": "'",
134
+ "”": "'",
135
+ '"': "'",
136
+ "‘": "'",
137
+ "’": "'",
138
+ "(": "'",
139
+ ")": "'",
140
+ "(": "'",
141
+ ")": "'",
142
+ "《": "'",
143
+ "》": "'",
144
+ "【": "'",
145
+ "】": "'",
146
+ "[": "'",
147
+ "]": "'",
148
+ "—": "-",
149
+ "−": "-",
150
+ "~": "-",
151
+ "~": "-",
152
+ "「": "'",
153
+ "」": "'",
154
+ }
155
+
156
+
157
def replace_punctuation(text):
    """Replace every ``rep_map`` key found in ``text`` with its mapped value."""
    alternation = "|".join(map(re.escape, rep_map))
    return re.sub(alternation, lambda hit: rep_map[hit.group()], text)
171
+
172
+
173
def read_dict():
    """Parse the syllabified CMU dictionary at ``CMU_DICT_PATH``.

    The first 48 lines are skipped. Each remaining line is a word, then
    whitespace, then a pronunciation whose syllables are separated by
    ``" - "`` and whose phonemes are separated by single spaces.

    Returns:
        Dict mapping each word to a list of syllables, each syllable being a
        list of phoneme strings.
    """
    g2p_dict = {}
    start_line = 49
    with open(CMU_DICT_PATH) as f:
        for line_index, line in enumerate(f, start=1):
            if line_index < start_line:
                continue

            # Split on the first run of whitespace. Splitting on a single
            # space yields an empty pronunciation whenever the separator is
            # wider than one space. Blank or word-only lines are skipped
            # instead of raising IndexError.
            parts = line.strip().split(maxsplit=1)
            if len(parts) < 2:
                continue
            word, pronunciation = parts

            g2p_dict[word] = [
                syllable.split(" ") for syllable in pronunciation.split(" - ")
            ]

    return g2p_dict
195
+
196
+
197
def cache_dict(g2p_dict, file_path):
    """Pickle ``g2p_dict`` to ``file_path``, overwriting any existing file."""
    with open(file_path, "wb") as sink:
        sink.write(pickle.dumps(g2p_dict))
200
+
201
+
202
def get_dict():
    """Return the G2P dictionary, using the pickle cache when it exists.

    If ``CACHE_PATH`` is missing, the dictionary is parsed with ``read_dict``
    and written to the cache before being returned.
    """
    if not os.path.exists(CACHE_PATH):
        fresh = read_dict()
        cache_dict(fresh, CACHE_PATH)
        return fresh

    with open(CACHE_PATH, "rb") as cached:
        # NOTE(review): pickle.load can execute code embedded in the file, so
        # the cache file must only ever come from this program.
        return pickle.load(cached)
211
+
212
+
213
+ eng_dict = get_dict()
214
+
215
+
216
def refine_ph(phn):
    """Split an ARPAbet phoneme into ``(lower-case phoneme, tone)``.

    A trailing stress digit ``d`` is removed and becomes tone ``d + 1``;
    a phoneme without a trailing digit gets tone 3.
    """
    if re.search(r"\d$", phn) is None:
        return phn.lower(), 3
    stress = int(phn[-1])
    return phn[:-1].lower(), stress + 1
224
+
225
+
226
def refine_syllables(syllables):
    """Flatten syllables and run every phoneme through ``refine_ph``.

    Args:
        syllables: list of syllables, each a list of ARPAbet phonemes.

    Returns:
        Tuple ``(phonemes, tones)`` of two parallel lists in input order.
    """
    refined = [refine_ph(phn) for phn_list in syllables for phn in phn_list]
    phonemes = [phoneme for phoneme, _ in refined]
    tones = [tone for _, tone in refined]
    return phonemes, tones
236
+
237
+
238
+ import inflect
239
+
240
+ _inflect = inflect.engine()
241
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
242
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
243
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
244
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
245
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
246
+ _number_re = re.compile(r"[0-9]+")
247
+
248
+ # List of (regular expression, replacement) pairs for abbreviations:
249
+ _abbreviations = [
250
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
251
+ for x in [
252
+ ("mrs", "misess"),
253
+ ("mr", "mister"),
254
+ ("dr", "doctor"),
255
+ ("st", "saint"),
256
+ ("co", "company"),
257
+ ("jr", "junior"),
258
+ ("maj", "major"),
259
+ ("gen", "general"),
260
+ ("drs", "doctors"),
261
+ ("rev", "reverend"),
262
+ ("lt", "lieutenant"),
263
+ ("hon", "honorable"),
264
+ ("sgt", "sergeant"),
265
+ ("capt", "captain"),
266
+ ("esq", "esquire"),
267
+ ("ltd", "limited"),
268
+ ("col", "colonel"),
269
+ ("ft", "fort"),
270
+ ]
271
+ ]
272
+
273
+
274
+ # List of (ipa, lazy ipa) pairs:
275
+ _lazy_ipa = [
276
+ (re.compile("%s" % x[0]), x[1])
277
+ for x in [
278
+ ("r", "ɹ"),
279
+ ("æ", "e"),
280
+ ("ɑ", "a"),
281
+ ("ɔ", "o"),
282
+ ("ð", "z"),
283
+ ("θ", "s"),
284
+ ("ɛ", "e"),
285
+ ("ɪ", "i"),
286
+ ("ʊ", "u"),
287
+ ("ʒ", "ʥ"),
288
+ ("ʤ", "ʥ"),
289
+ ("ˈ", "↓"),
290
+ ]
291
+ ]
292
+
293
+ # List of (ipa, lazy ipa2) pairs:
294
+ _lazy_ipa2 = [
295
+ (re.compile("%s" % x[0]), x[1])
296
+ for x in [
297
+ ("r", "ɹ"),
298
+ ("ð", "z"),
299
+ ("θ", "s"),
300
+ ("ʒ", "ʑ"),
301
+ ("ʤ", "dʑ"),
302
+ ("ˈ", "↓"),
303
+ ]
304
+ ]
305
+
306
+ # List of (ipa, ipa2) pairs
307
+ _ipa_to_ipa2 = [
308
+ (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
309
+ ]
310
+
311
+
312
+ def _expand_dollars(m):
313
+ match = m.group(1)
314
+ parts = match.split(".")
315
+ if len(parts) > 2:
316
+ return match + " dollars" # Unexpected format
317
+ dollars = int(parts[0]) if parts[0] else 0
318
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
319
+ if dollars and cents:
320
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
321
+ cent_unit = "cent" if cents == 1 else "cents"
322
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
323
+ elif dollars:
324
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
325
+ return "%s %s" % (dollars, dollar_unit)
326
+ elif cents:
327
+ cent_unit = "cent" if cents == 1 else "cents"
328
+ return "%s %s" % (cents, cent_unit)
329
+ else:
330
+ return "zero dollars"
331
+
332
+
333
+ def _remove_commas(m):
334
+ return m.group(1).replace(",", "")
335
+
336
+
337
def _expand_ordinal(m):
    """Spell out the ordinal matched by ``m`` (e.g. ``"2nd"``) using inflect."""
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
339
+
340
+
341
def _expand_number(m):
    """Spell out the integer matched by ``m``.

    Values strictly between 1000 and 3000 get special treatment: 2000 is
    "two thousand", 2001-2009 are "two thousand <n>", exact hundreds are
    "<n> hundred", and everything else is read as two-digit groups with "oh"
    for zero. All other values use inflect's plain wording without "and".
    """
    num = int(m.group(0))

    if not (1000 < num < 3000):
        return _inflect.number_to_words(num, andword="")
    if num == 2000:
        return "two thousand"
    if 2000 < num < 2010:
        return "two thousand " + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + " hundred"

    grouped = _inflect.number_to_words(num, andword="", zero="oh", group=2)
    return grouped.replace(", ", " ")
356
+
357
+
358
+ def _expand_decimal_point(m):
359
+ return m.group(1).replace(".", " point ")
360
+
361
+
362
def normalize_numbers(text):
    """Rewrite numeric expressions in ``text`` as words.

    The passes run in a fixed order: thousands commas, pounds, dollars,
    decimals, ordinals, then plain integers.
    """
    passes = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r"\1 pounds"),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)
    return text
370
+
371
+
372
def text_normalize(text):
    """Normalize English text for grapheme-to-phoneme conversion.

    Numbers are spelled out, punctuation is mapped through ``rep_map``, and a
    space is inserted after ``, ; . ? !`` when a word character follows.
    """
    spoken = replace_punctuation(normalize_numbers(text))
    return re.sub(r"([,;.\?\!])([\w])", r"\1 \2", spoken)
377
+
378
+
379
def distribute_phone(n_phone, n_word):
    """Spread ``n_phone`` phonemes over ``n_word`` slots as evenly as possible.

    Every slot receives ``n_phone // n_word`` phonemes and the first
    ``n_phone % n_word`` slots receive one more. This is the result of
    repeatedly giving a phoneme to the first least-loaded slot, computed
    directly instead of with a min/index scan per phoneme.

    Args:
        n_phone: number of phonemes to distribute.
        n_word: number of slots.

    Returns:
        List of ``n_word`` non-negative ints that sum to ``n_phone``.

    Raises:
        ValueError: if there are phonemes to place but no slot to put them in.
    """
    if n_phone <= 0:
        return [0] * n_word
    if n_word <= 0:
        raise ValueError("cannot distribute phonemes over zero words")
    base, extra = divmod(n_phone, n_word)
    return [base + 1] * extra + [base] * (n_word - extra)
386
+
387
+
388
def sep_text(text):
    """Split ``text`` on punctuation and whitespace, keeping the punctuation.

    Separators are any single character from ``, ; . ? ! +`` or whitespace;
    pieces that are empty or whitespace-only are dropped.
    """
    pieces = re.split(r"([,;.\?\!\s+])", text)
    return [piece for piece in pieces if piece.strip()]
392
+
393
+
394
def text_to_words(text):
    """Group the tokenizer's sub-word tokens of ``text`` into words.

    Returns:
        List of words, each a list of token strings. A token starting with
        "▁" opens a new word (the marker is stripped); other tokens are
        attached to the current word or, for punctuation, may form a word of
        their own (see the branches below).
    """
    tokens = tokenizer.tokenize(text)
    words = []
    for idx, t in enumerate(tokens):
        if t.startswith("▁"):
            # "▁" prefix: presumably the tokenizer's word-start marker --
            # begin a new word without it.
            words.append([t[1:]])
        else:
            if t in punctuation:
                if idx == len(tokens) - 1:
                    # Trailing punctuation is always its own word.
                    words.append([f"{t}"])
                else:
                    if (
                        not tokens[idx + 1].startswith("▁")
                        and tokens[idx + 1] not in punctuation
                    ):
                        # Punctuation followed by a continuation token (e.g.
                        # an apostrophe inside a word) stays in the current word.
                        if idx == 0:
                            words.append([])
                        words[-1].append(f"{t}")
                    else:
                        words.append([f"{t}"])
            else:
                # Continuation piece: extend the current word, creating one
                # first if this is the very first token.
                if idx == 0:
                    words.append([])
                words[-1].append(f"{t}")
    return words
419
+
420
+
421
def g2p(text):
    """Convert normalized English text to phonemes, tones and word2ph.

    Words come from ``text_to_words``. Each piece is looked up in
    ``eng_dict`` (upper-cased) and falls back to the ``_g2p`` model. All
    phonemes pass through ``post_replace_ph``. The phonemes of a word are
    then spread over its tokens with ``distribute_phone``.

    Returns:
        Tuple ``(phones, tones, word2ph)`` of lists, each wrapped with one
        leading and one trailing pad entry (``"_"``, ``0`` and ``1``).

    Raises:
        AssertionError: if the three lists end up inconsistent in length.
    """
    phones = []
    tones = []
    phone_len = []
    # words = sep_text(text)
    # tokens = [tokenizer.tokenize(i) for i in words]
    words = text_to_words(text)

    for word in words:
        temp_phones, temp_tones = [], []
        if len(word) > 1:
            if "'" in word:
                # Re-join the pieces of a word containing an apostrophe so it
                # is looked up as a whole. Only the local name is rebound;
                # ``words`` keeps the original pieces for the token count below.
                word = ["".join(word)]
        for w in word:
            if w in punctuation:
                temp_phones.append(w)
                temp_tones.append(0)
                continue
            if w.upper() in eng_dict:
                phns, tns = refine_syllables(eng_dict[w.upper()])
                temp_phones += [post_replace_ph(i) for i in phns]
                temp_tones += tns
                # w2ph.append(len(phns))
            else:
                # Not in the dictionary: use the g2p model, dropping the
                # space entries it emits.
                phone_list = list(filter(lambda p: p != " ", _g2p(w)))
                phns = []
                tns = []
                for ph in phone_list:
                    if ph in arpa:
                        ph, tn = refine_ph(ph)
                        phns.append(ph)
                        tns.append(tn)
                    else:
                        # Non-ARPAbet output is kept as-is with tone 0.
                        phns.append(ph)
                        tns.append(0)
                temp_phones += [post_replace_ph(i) for i in phns]
                temp_tones += tns
        phones += temp_phones
        tones += temp_tones
        phone_len.append(len(temp_phones))
        # phones = [post_replace_ph(i) for i in phones]

    word2ph = []
    for token, pl in zip(words, phone_len):
        # Number of tokenizer pieces in this word.
        word_len = len(token)

        # Spread the word's phonemes over its pieces.
        aaa = distribute_phone(pl, word_len)
        word2ph += aaa

    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    assert len(phones) == len(tones), text
    assert len(phones) == sum(word2ph), text

    return phones, tones, word2ph
477
+
478
+
479
def get_bert_feature(text, word2ph):
    """Forward to ``text.english_bert_mock.get_bert_feature`` with its default options."""
    # Imported lazily: english_bert_mock loads a tokenizer at import time.
    from text import english_bert_mock

    extract = english_bert_mock.get_bert_feature
    return extract(text, word2ph)
483
+
484
+
485
+ if __name__ == "__main__":
486
+ # print(get_dict())
487
+ # print(eng_word_to_phoneme("hello"))
488
+ print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
489
+ # all_phones = set()
490
+ # for k, syllables in eng_dict.items():
491
+ # for group in syllables:
492
+ # for ph in group:
493
+ # all_phones.add(ph)
494
+ # print(all_phones)
text/english_bert_mock.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import DebertaV2Model, DebertaV2Tokenizer
5
+
6
+ LOCAL_PATH = "./bert/deberta-v3-large"
7
+
8
+ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
9
+
10
+ models = dict()
11
+
12
+
13
def get_bert_feature(
    text,
    word2ph,
    device="cpu",
    style_text=None,
    style_weight=0.7,
):
    """Compute phoneme-level DeBERTa features for English ``text``.

    The third-from-last hidden state is taken per token, and row ``i`` is
    repeated ``word2ph[i]`` times. When ``style_text`` is given, each row is
    blended with the mean hidden state of the style text using
    ``style_weight``.

    Args:
        text: normalized text; its token count must equal ``len(word2ph)``.
        word2ph: number of phonemes per token.
        device: device string. ``"cpu"`` is upgraded to ``"mps"`` on macOS
            when MPS is available; a falsy value selects ``"cuda"``.
        style_text: optional style reference text.
        style_weight: weight of the style mean in the blend.

    Returns:
        The concatenated rows, transposed -- presumably
        ``(hidden_dim, sum(word2ph))``; TODO confirm.
    """
    on_macos_with_mps = sys.platform == "darwin" and torch.backends.mps.is_available()
    if on_macos_with_mps and device == "cpu":
        device = "mps"
    if not device:
        device = "cuda"

    # One model instance is cached per device.
    if device not in models:
        models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
    model = models[device]

    def _third_last_hidden(sentence):
        # Tokenize, move tensors to the device, return hidden_states[-3][0] on CPU.
        encoded = tokenizer(sentence, return_tensors="pt")
        for key in encoded:
            encoded[key] = encoded[key].to(device)
        output = model(**encoded, output_hidden_states=True)
        return torch.cat(output["hidden_states"][-3:-2], -1)[0].cpu()

    with torch.no_grad():
        res = _third_last_hidden(text)
        if style_text:
            style_res_mean = _third_last_hidden(style_text).mean(0)

    assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))

    rows = []
    for idx, repeats in enumerate(word2ph):
        tiled = res[idx].repeat(repeats, 1)
        if style_text:
            tiled = (
                tiled * (1 - style_weight)
                + style_res_mean.repeat(repeats, 1) * style_weight
            )
        rows.append(tiled)

    return torch.cat(rows, dim=0).T
text/jyutping.csv ADDED
The diff for this file is too large to render. See raw diff
 
text/symbols.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
+ pu_symbols = punctuation + ["SP", "UNK"]
3
+ pad = "_"
4
+
5
+ # English
6
+ en_symbols = [
7
+ "aa",
8
+ "ae",
9
+ "ah",
10
+ "ao",
11
+ "aw",
12
+ "ay",
13
+ "b",
14
+ "ch",
15
+ "d",
16
+ "dh",
17
+ "eh",
18
+ "er",
19
+ "ey",
20
+ "f",
21
+ "g",
22
+ "hh",
23
+ "ih",
24
+ "iy",
25
+ "jh",
26
+ "k",
27
+ "l",
28
+ "m",
29
+ "n",
30
+ "ng",
31
+ "ow",
32
+ "oy",
33
+ "p",
34
+ "r",
35
+ "s",
36
+ "sh",
37
+ "t",
38
+ "th",
39
+ "uh",
40
+ "uw",
41
+ "V",
42
+ "w",
43
+ "y",
44
+ "z",
45
+ "zh",
46
+ ]
47
+ num_en_tones = 4
48
+
49
+ # Cantonese
50
+ yue_symbols = [
51
+ "",
52
+ "aa",
53
+ "aai",
54
+ "aak",
55
+ "aam",
56
+ "aan",
57
+ "aang",
58
+ "aap",
59
+ "aat",
60
+ "aau",
61
+ "ai",
62
+ "ak",
63
+ "am",
64
+ "an",
65
+ "ang",
66
+ "ap",
67
+ "at",
68
+ "au",
69
+ "b",
70
+ "c",
71
+ "d",
72
+ "e",
73
+ "ei",
74
+ "ek",
75
+ "em",
76
+ "eng",
77
+ "eoi",
78
+ "eon",
79
+ "eot",
80
+ "ep",
81
+ "eu",
82
+ "f",
83
+ "g",
84
+ "gw",
85
+ "h",
86
+ "i",
87
+ "ik",
88
+ "im",
89
+ "in",
90
+ "ing",
91
+ "ip",
92
+ "it",
93
+ "iu",
94
+ "j",
95
+ "k",
96
+ "kw",
97
+ "l",
98
+ "m",
99
+ "m",
100
+ "n",
101
+ "ng",
102
+ "ng",
103
+ "o",
104
+ "oe",
105
+ "oek",
106
+ "oeng",
107
+ "oi",
108
+ "ok",
109
+ "on",
110
+ "ong",
111
+ "ot",
112
+ "ou",
113
+ "p",
114
+ "s",
115
+ "t",
116
+ "u",
117
+ "ui",
118
+ "uk",
119
+ "un",
120
+ "ung",
121
+ "ut",
122
+ "w",
123
+ "yu",
124
+ "yun",
125
+ "yut",
126
+ "z"
127
+ ]
128
+
129
+ num_yue_tones = 7
130
+
131
+ # combine all symbols
132
+ normal_symbols = sorted(
133
+ set(en_symbols + yue_symbols))
134
+ symbols = [pad] + normal_symbols + pu_symbols
135
+ sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
136
+
137
+ # combine all tones
138
+ num_tones = num_en_tones + num_yue_tones
139
+
140
+ # language maps
141
+ language_id_map = {"EN": 0, "YUE": 1}
142
+ num_languages = len(language_id_map.keys())
143
+
144
+ language_tone_start_map = {
145
+ "EN": 0,
146
+ "YUE": num_en_tones,
147
+ }
148
+
149
+ if __name__ == "__main__":
150
+ a = set(yue_symbols)
151
+ b = set(en_symbols)
152
+ print(sorted(a & b))
text/yue_dict.txt ADDED
The diff for this file is too large to render. See raw diff