shibing624 committed
Commit 6038c95 · verified · 1 Parent(s): 35f153e

Delete text
text/__init__.py DELETED
@@ -1,15 +0,0 @@
- from text.symbols import *
-
-
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
-
- def cleaned_text_to_sequence(cleaned_text):
-     '''Converts cleaned text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-         cleaned_text: sequence of symbols to convert
-     Returns:
-         List of integers corresponding to the symbols in the text
-     '''
-     phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
-     return phones
-
 
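This deleted __init__ helper is the final lookup step shared by all three language front ends: each cleaned phoneme symbol becomes an integer ID via the table built from text/symbols.py. A minimal usage sketch (the example phones are illustrative output for "你好" from the Chinese front end below):

    from text import cleaned_text_to_sequence

    phones = ["n", "i2", "h", "ao3"]        # e.g. g2p output for "你好"
    ids = cleaned_text_to_sequence(phones)  # list of integer symbol IDs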
text/chinese.py DELETED
@@ -1,172 +0,0 @@
- import os
- import pdb
- import re
-
- import cn2an
- from pypinyin import lazy_pinyin, Style
-
- import sys
-
- sys.path.append("/data/docker/liujing04/gpt-vits/gpt-vits-master")
-
- from text.symbols import punctuation
- from text.tone_sandhi import ToneSandhi
-
- current_file_path = os.path.dirname(__file__)
- pinyin_to_symbol_map = {
-     line.split("\t")[0]: line.strip().split("\t")[1]
-     for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
- }
-
- import jieba_fast.posseg as psg
-
-
- rep_map = {
-     ":": ",",
-     ";": ",",
-     ",": ",",
-     "。": ".",
-     "!": "!",
-     "?": "?",
-     "\n": ".",
-     "·": ",",
-     "、": ",",
-     "...": "…",
-     "$": ".",
-     "/": ",",
-     "—": "-",
- }
-
- tone_modifier = ToneSandhi()
-
-
- def replace_punctuation(text):
-     text = text.replace("嗯", "恩").replace("呣", "母")
-     pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
-
-     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
-
-     replaced_text = re.sub(
-         r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
-     )
-
-     return replaced_text
-
-
- def g2p(text):
-     pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
-     sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
-     phones, word2ph = _g2p(sentences)
-     return phones, word2ph
-
-
- def _get_initials_finals(word):
-     initials = []
-     finals = []
-     orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
-     orig_finals = lazy_pinyin(
-         word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
-     )
-     for c, v in zip(orig_initials, orig_finals):
-         initials.append(c)
-         finals.append(v)
-     return initials, finals
-
-
- def _g2p(segments):
-     phones_list = []
-     word2ph = []
-     for seg in segments:
-         pinyins = []
-         # Replace all English words in the sentence
-         seg = re.sub("[a-zA-Z]+", "", seg)
-         seg_cut = psg.lcut(seg)
-         initials = []
-         finals = []
-         seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
-         for word, pos in seg_cut:
-             if pos == "eng":
-                 continue
-             sub_initials, sub_finals = _get_initials_finals(word)
-             sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
-             initials.append(sub_initials)
-             finals.append(sub_finals)
-
-             # assert len(sub_initials) == len(sub_finals) == len(word)
-         initials = sum(initials, [])
-         finals = sum(finals, [])
-         #
-         for c, v in zip(initials, finals):
-             raw_pinyin = c + v
-             # NOTE: post process for pypinyin outputs
-             # we discriminate i, ii and iii
-             if c == v:
-                 assert c in punctuation
-                 phone = [c]
-                 word2ph.append(1)
-             else:
-                 v_without_tone = v[:-1]
-                 tone = v[-1]
-
-                 pinyin = c + v_without_tone
-                 assert tone in "12345"
-
-                 if c:
-                     # syllable with an initial
-                     v_rep_map = {
-                         "uei": "ui",
-                         "iou": "iu",
-                         "uen": "un",
-                     }
-                     if v_without_tone in v_rep_map.keys():
-                         pinyin = c + v_rep_map[v_without_tone]
-                 else:
-                     # syllable without an initial
-                     pinyin_rep_map = {
-                         "ing": "ying",
-                         "i": "yi",
-                         "in": "yin",
-                         "u": "wu",
-                     }
-                     if pinyin in pinyin_rep_map.keys():
-                         pinyin = pinyin_rep_map[pinyin]
-                     else:
-                         single_rep_map = {
-                             "v": "yu",
-                             "e": "e",
-                             "i": "y",
-                             "u": "w",
-                         }
-                         if pinyin[0] in single_rep_map.keys():
-                             pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
-
-                 assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
-                 new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
-                 new_v = new_v + tone
-                 phone = [new_c, new_v]
-                 word2ph.append(len(phone))
-
-             phones_list += phone
-     return phones_list, word2ph
-
-
- def text_normalize(text):
-     numbers = re.findall(r"\d+(?:\.?\d+)?", text)
-     for number in numbers:
-         text = text.replace(number, cn2an.an2cn(number), 1)
-     text = replace_punctuation(text)
-
-     return text
-
-
- if __name__ == "__main__":
-     text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
-     text = "呣呣呣~就是…大人的鼹鼠党吧?"
-     text = "你好"
-     text = text_normalize(text)
-     print(g2p(text))
-
-
- # # Example usage
- # text = "这是一个示例文本:,你好!这是一个测试..."
- # print(g2p_paddle(text))  # output: 这是一个示例文本你好这是一个测试
 
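The deleted Chinese pipeline is: text_normalize (digits expanded via cn2an, punctuation via rep_map), then g2p, which segments with jieba_fast, applies ToneSandhi, and maps each syllable through opencpop-strict.txt. A sketch of the intended call pattern (assuming cn2an, pypinyin, and jieba_fast are installed):

    from text.chinese import text_normalize, g2p

    norm = text_normalize("我有2个苹果")  # "2" -> "二", punctuation normalized
    phones, word2ph = g2p(norm)
    # word2ph[i] is the phone count of the i-th character,
    # so sum(word2ph) == len(phones); text/cleaner.py re-asserts this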
text/cleaner.py DELETED
@@ -1,55 +0,0 @@
- from text import chinese, japanese, cleaned_text_to_sequence, symbols, english
-
- language_module_map = {"zh": chinese, "ja": japanese, "en": english}
- special = [
-     ("%", "zh", "SP"),
-     ("¥", "zh", "SP2"),
-     ("^", "zh", "SP3"),
-     # ('@', 'zh', "SP4")  # dropped, to stay consistent with the second version
- ]
-
-
- def clean_text(text, language):
-     for special_s, special_l, target_symbol in special:
-         if special_s in text and language == special_l:
-             return clean_special(text, language, special_s, target_symbol)
-     language_module = language_module_map[language]
-     norm_text = language_module.text_normalize(text)
-     if language == "zh":
-         phones, word2ph = language_module.g2p(norm_text)
-         assert len(phones) == sum(word2ph)
-         assert len(norm_text) == len(word2ph)
-     else:
-         phones = language_module.g2p(norm_text)
-         word2ph = None
-
-     for ph in phones:
-         assert ph in symbols
-     return phones, word2ph, norm_text
-
-
- def clean_special(text, language, special_s, target_symbol):
-     """
-     Map a special character to an explicit silence ("SP") symbol.
-     """
-     text = text.replace(special_s, ",")
-     language_module = language_module_map[language]
-     norm_text = language_module.text_normalize(text)
-     # all `special` entries are "zh", whose g2p returns (phones, word2ph)
-     phones, word2ph = language_module.g2p(norm_text)
-     new_ph = []
-     for ph in phones:
-         assert ph in symbols
-         if ph == ",":
-             new_ph.append(target_symbol)
-         else:
-             new_ph.append(ph)
-     return new_ph
-
-
- def text_to_sequence(text, language):
-     phones, word2ph, norm_text = clean_text(text, language)
-     return cleaned_text_to_sequence(phones)
-
-
- if __name__ == "__main__":
-     print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))
 
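clean_text is the entry point the rest of the pipeline called; the special table turns reserved characters into explicit pause symbols. Note the two return shapes (a 3-tuple normally, a bare phone list on the special path), mirroring the module's own __main__:

    from text.cleaner import clean_text

    phones, word2ph, norm_text = clean_text("你好,世界。", "zh")
    sp_phones = clean_text("你好%啊", "zh")  # '%' path: phones only, ',' replaced by "SP"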
text/cmudict.rep DELETED
The diff for this file is too large to render. See raw diff
 
text/cmudict_cache.pickle DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
- size 6212655
 
text/english.py DELETED
@@ -1,179 +0,0 @@
- import pickle
- import os
- import re
- from g2p_en import G2p
-
- from string import punctuation
-
- from text import symbols
-
- current_file_path = os.path.dirname(__file__)
- CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
- CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
- _g2p = G2p()
-
- arpa = {
-     "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B", "G",
-     "AY0", "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2", "DH", "IY0",
-     "EY1", "IH0", "K", "N", "W", "IY2", "T", "AA1", "ER1", "EH2", "OY0",
-     "UH2", "UW1", "Z", "AW2", "AW1", "V", "UW2", "AA2", "ER", "AW0", "UW0",
-     "R", "OW1", "EH1", "ZH", "AE0", "IH2", "IH", "Y", "JH", "P", "AY1",
-     "EY0", "OY2", "TH", "HH", "D", "ER0", "CH", "AO1", "AE1", "AO2", "OY1",
-     "AY2", "IH1", "OW0", "L", "SH",
- }
-
-
- def replace_phs(phs):
-     rep_map = {";": ",", ":": ",", "'": "-", '"': "-"}
-     phs_new = []
-     for ph in phs:
-         if ph in symbols:
-             phs_new.append(ph)
-         elif ph in rep_map.keys():
-             phs_new.append(rep_map[ph])
-         else:
-             print("ph not in symbols: ", ph)
-     return phs_new
-
-
- def read_dict():
-     g2p_dict = {}
-     start_line = 49
-     with open(CMU_DICT_PATH) as f:
-         line = f.readline()
-         line_index = 1
-         while line:
-             if line_index >= start_line:
-                 line = line.strip()
-                 # cmudict.rep separates the word from its phones with two spaces
-                 word_split = line.split("  ")
-                 word = word_split[0]
-
-                 syllable_split = word_split[1].split(" - ")
-                 g2p_dict[word] = []
-                 for syllable in syllable_split:
-                     phone_split = syllable.split(" ")
-                     g2p_dict[word].append(phone_split)
-
-             line_index = line_index + 1
-             line = f.readline()
-
-     return g2p_dict
-
-
- def cache_dict(g2p_dict, file_path):
-     with open(file_path, "wb") as pickle_file:
-         pickle.dump(g2p_dict, pickle_file)
-
-
- def get_dict():
-     if os.path.exists(CACHE_PATH):
-         with open(CACHE_PATH, "rb") as pickle_file:
-             g2p_dict = pickle.load(pickle_file)
-     else:
-         g2p_dict = read_dict()
-         cache_dict(g2p_dict, CACHE_PATH)
-
-     return g2p_dict
-
-
- eng_dict = get_dict()
-
-
- def text_normalize(text):
-     # todo: eng text normalize
-     return text.replace(";", ",")
-
-
- def g2p(text):
-     phones = []
-     words = re.split(r"([,;.\-\?\!\s+])", text)
-     for w in words:
-         if w.upper() in eng_dict:
-             phns = eng_dict[w.upper()]
-             for ph in phns:
-                 phones += ph
-         else:
-             phone_list = list(filter(lambda p: p != " ", _g2p(w)))
-             for ph in phone_list:
-                 # original appended ph in both branches of an `if ph in arpa` check;
-                 # replace_phs() filters unknown symbols afterwards
-                 phones.append(ph)
-
-     return replace_phs(phones)
-
-
- if __name__ == "__main__":
-     # print(get_dict())
-     print(g2p("hello"))
-     print(g2p("In this; paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
-     # all_phones = set()
-     # for k, syllables in eng_dict.items():
-     #     for group in syllables:
-     #         for ph in group:
-     #             all_phones.add(ph)
-     # print(all_phones)
 
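Known words came from the bundled cmudict.rep (parsed once, then cached in cmudict_cache.pickle); out-of-vocabulary words fall back to g2p_en. Example matching the module's own __main__ (ARPAbet per CMUdict):

    from text.english import g2p

    print(g2p("hello"))  # ['HH', 'AH0', 'L', 'OW1']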
text/japanese.py DELETED
@@ -1,106 +0,0 @@
- # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
- import re
- import sys
-
- import pyopenjtalk
-
- from text import symbols
-
- # Regular expression matching Japanese without punctuation marks:
- _japanese_characters = re.compile(
-     r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
- )
-
- # Regular expression matching non-Japanese characters or punctuation marks:
- _japanese_marks = re.compile(
-     r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
- )
-
- # List of (symbol, Japanese) pairs for marks:
- _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
-
-
- # List of (consonant, sokuon) pairs:
- _real_sokuon = [
-     (re.compile("%s" % x[0]), x[1])
-     for x in [
-         (r"Q([↑↓]*[kg])", r"k#\1"),
-         (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
-         (r"Q([↑↓]*[sʃ])", r"s\1"),
-         (r"Q([↑↓]*[pb])", r"p#\1"),
-     ]
- ]
-
- # List of (consonant, hatsuon) pairs:
- _real_hatsuon = [
-     (re.compile("%s" % x[0]), x[1])
-     for x in [
-         (r"N([↑↓]*[pbm])", r"m\1"),
-         (r"N([↑↓]*[ʧʥj])", r"n^\1"),
-         (r"N([↑↓]*[tdn])", r"n\1"),
-         (r"N([↑↓]*[kg])", r"ŋ\1"),
-     ]
- ]
-
-
- def post_replace_ph(ph):
-     rep_map = {
-         ":": ",",
-         ";": ",",
-         ",": ",",
-         "。": ".",
-         "!": "!",
-         "?": "?",
-         "\n": ".",
-         "·": ",",
-         "、": ",",
-         "...": "…",
-     }
-     if ph in rep_map.keys():
-         ph = rep_map[ph]
-     if ph in symbols:
-         return ph
-     return "UNK"
-
-
- def symbols_to_japanese(text):
-     for regex, replacement in _symbols_to_japanese:
-         text = re.sub(regex, replacement, text)
-     return text
-
-
- def preprocess_jap(text):
-     """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
-     text = symbols_to_japanese(text)
-     sentences = re.split(_japanese_marks, text)
-     marks = re.findall(_japanese_marks, text)
-     text = []
-     for i, sentence in enumerate(sentences):
-         if re.match(_japanese_characters, sentence):
-             p = pyopenjtalk.g2p(sentence)
-             text += p.split(" ")
-
-         if i < len(marks):
-             text += [marks[i].replace(" ", "")]
-     return text
-
-
- def text_normalize(text):
-     # todo: jap text normalize
-     return text
-
-
- def g2p(norm_text):
-     phones = preprocess_jap(norm_text)
-     phones = [post_replace_ph(i) for i in phones]
-     # todo: implement tones and word2ph
-     return phones
-
-
- if __name__ == "__main__":
-     for line in open("../../../Downloads/transcript_utf8.txt").readlines():
-         text = line.split(":")[1]
-         phones = g2p(text)
-         print(phones)
 
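pyopenjtalk does the actual Japanese g2p; post_replace_ph then normalizes punctuation and collapses anything outside text/symbols.py to "UNK". Sketch (assuming pyopenjtalk is installed; the output shown is the typical result):

    from text.japanese import g2p

    print(g2p("こんにちは"))  # roughly ['k', 'o', 'N', 'n', 'i', 'ch', 'i', 'w', 'a']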
text/opencpop-strict.txt DELETED
@@ -1,429 +0,0 @@
- a AA a
- ai AA ai
- an AA an
- ang AA ang
- ao AA ao
- ba b a
- bai b ai
- ban b an
- bang b ang
- bao b ao
- bei b ei
- ben b en
- beng b eng
- bi b i
- bian b ian
- biao b iao
- bie b ie
- bin b in
- bing b ing
- bo b o
- bu b u
- ca c a
- cai c ai
- can c an
- cang c ang
- cao c ao
- ce c e
- cei c ei
- cen c en
- ceng c eng
- cha ch a
- chai ch ai
- chan ch an
- chang ch ang
- chao ch ao
- che ch e
- chen ch en
- cheng ch eng
- chi ch ir
- chong ch ong
- chou ch ou
- chu ch u
- chua ch ua
- chuai ch uai
- chuan ch uan
- chuang ch uang
- chui ch ui
- chun ch un
- chuo ch uo
- ci c i0
- cong c ong
- cou c ou
- cu c u
- cuan c uan
- cui c ui
- cun c un
- cuo c uo
- da d a
- dai d ai
- dan d an
- dang d ang
- dao d ao
- de d e
- dei d ei
- den d en
- deng d eng
- di d i
- dia d ia
- dian d ian
- diao d iao
- die d ie
- ding d ing
- diu d iu
- dong d ong
- dou d ou
- du d u
- duan d uan
- dui d ui
- dun d un
- duo d uo
- e EE e
- ei EE ei
- en EE en
- eng EE eng
- er EE er
- fa f a
- fan f an
- fang f ang
- fei f ei
- fen f en
- feng f eng
- fo f o
- fou f ou
- fu f u
- ga g a
- gai g ai
- gan g an
- gang g ang
- gao g ao
- ge g e
- gei g ei
- gen g en
- geng g eng
- gong g ong
- gou g ou
- gu g u
- gua g ua
- guai g uai
- guan g uan
- guang g uang
- gui g ui
- gun g un
- guo g uo
- ha h a
- hai h ai
- han h an
- hang h ang
- hao h ao
- he h e
- hei h ei
- hen h en
- heng h eng
- hong h ong
- hou h ou
- hu h u
- hua h ua
- huai h uai
- huan h uan
- huang h uang
- hui h ui
- hun h un
- huo h uo
- ji j i
- jia j ia
- jian j ian
- jiang j iang
- jiao j iao
- jie j ie
- jin j in
- jing j ing
- jiong j iong
- jiu j iu
- ju j v
- jv j v
- juan j van
- jvan j van
- jue j ve
- jve j ve
- jun j vn
- jvn j vn
- ka k a
- kai k ai
- kan k an
- kang k ang
- kao k ao
- ke k e
- kei k ei
- ken k en
- keng k eng
- kong k ong
- kou k ou
- ku k u
- kua k ua
- kuai k uai
- kuan k uan
- kuang k uang
- kui k ui
- kun k un
- kuo k uo
- la l a
- lai l ai
- lan l an
- lang l ang
- lao l ao
- le l e
- lei l ei
- leng l eng
- li l i
- lia l ia
- lian l ian
- liang l iang
- liao l iao
- lie l ie
- lin l in
- ling l ing
- liu l iu
- lo l o
- long l ong
- lou l ou
- lu l u
- luan l uan
- lun l un
- luo l uo
- lv l v
- lve l ve
- ma m a
- mai m ai
- man m an
- mang m ang
- mao m ao
- me m e
- mei m ei
- men m en
- meng m eng
- mi m i
- mian m ian
- miao m iao
- mie m ie
- min m in
- ming m ing
- miu m iu
- mo m o
- mou m ou
- mu m u
- na n a
- nai n ai
- nan n an
- nang n ang
- nao n ao
- ne n e
- nei n ei
- nen n en
- neng n eng
- ni n i
- nian n ian
- niang n iang
- niao n iao
- nie n ie
- nin n in
- ning n ing
- niu n iu
- nong n ong
- nou n ou
- nu n u
- nuan n uan
- nun n un
- nuo n uo
- nv n v
- nve n ve
- o OO o
- ou OO ou
- pa p a
- pai p ai
- pan p an
- pang p ang
- pao p ao
- pei p ei
- pen p en
- peng p eng
- pi p i
- pian p ian
- piao p iao
- pie p ie
- pin p in
- ping p ing
- po p o
- pou p ou
- pu p u
- qi q i
- qia q ia
- qian q ian
- qiang q iang
- qiao q iao
- qie q ie
- qin q in
- qing q ing
- qiong q iong
- qiu q iu
- qu q v
- qv q v
- quan q van
- qvan q van
- que q ve
- qve q ve
- qun q vn
- qvn q vn
- ran r an
- rang r ang
- rao r ao
- re r e
- ren r en
- reng r eng
- ri r ir
- rong r ong
- rou r ou
- ru r u
- rua r ua
- ruan r uan
- rui r ui
- run r un
- ruo r uo
- sa s a
- sai s ai
- san s an
- sang s ang
- sao s ao
- se s e
- sen s en
- seng s eng
- sha sh a
- shai sh ai
- shan sh an
- shang sh ang
- shao sh ao
- she sh e
- shei sh ei
- shen sh en
- sheng sh eng
- shi sh ir
- shou sh ou
- shu sh u
- shua sh ua
- shuai sh uai
- shuan sh uan
- shuang sh uang
- shui sh ui
- shun sh un
- shuo sh uo
- si s i0
- song s ong
- sou s ou
- su s u
- suan s uan
- sui s ui
- sun s un
- suo s uo
- ta t a
- tai t ai
- tan t an
- tang t ang
- tao t ao
- te t e
- tei t ei
- teng t eng
- ti t i
- tian t ian
- tiao t iao
- tie t ie
- ting t ing
- tong t ong
- tou t ou
- tu t u
- tuan t uan
- tui t ui
- tun t un
- tuo t uo
- wa w a
- wai w ai
- wan w an
- wang w ang
- wei w ei
- wen w en
- weng w eng
- wo w o
- wu w u
- xi x i
- xia x ia
- xian x ian
- xiang x iang
- xiao x iao
- xie x ie
- xin x in
- xing x ing
- xiong x iong
- xiu x iu
- xu x v
- xv x v
- xuan x van
- xvan x van
- xue x ve
- xve x ve
- xun x vn
- xvn x vn
- ya y a
- yan y En
- yang y ang
- yao y ao
- ye y E
- yi y i
- yin y in
- ying y ing
- yo y o
- yong y ong
- you y ou
- yu y v
- yv y v
- yuan y van
- yvan y van
- yue y ve
- yve y ve
- yun y vn
- yvn y vn
- za z a
- zai z ai
- zan z an
- zang z ang
- zao z ao
- ze z e
- zei z ei
- zen z en
- zeng z eng
- zha zh a
- zhai zh ai
- zhan zh an
- zhang zh ang
- zhao zh ao
- zhe zh e
- zhei zh ei
- zhen zh en
- zheng zh eng
- zhi zh ir
- zhong zh ong
- zhou zh ou
- zhu zh u
- zhua zh ua
- zhuai zh uai
- zhuan zh uan
- zhuang zh uang
- zhui zh ui
- zhun zh un
- zhuo zh uo
- zi z i0
- zong z ong
- zou z ou
- zu z u
- zuan z uan
- zui z ui
- zun z un
- zuo z uo
 
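Each row maps one pinyin syllable to the "initial final" symbol pair the model uses (the on-disk file is tab-separated, per the split("\t") in text/chinese.py; note the special finals "ir" for zhi/chi/shi/ri, "i0" for zi/ci/si, and "v" for ü). text/chinese.py consumes it like this:

    # pinyin_to_symbol_map is built in text/chinese.py: {"zhong": "zh ong", ...}
    new_c, new_v = pinyin_to_symbol_map["zhong"].split(" ")  # ("zh", "ong")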
text/symbols.py DELETED
@@ -1,397 +0,0 @@
- import os
-
- # punctuation = ['!', '?', '…', ",", ".", "@"]  # '@' is the SP pause symbol
- punctuation = ["!", "?", "…", ",", "."]  # '@' (SP pause) removed
- punctuation.append("-")
- pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
- # pu_symbols = punctuation + ["SP", 'SP2', 'SP3', 'SP4', "UNK"]
- pad = "_"
-
- # Chinese initials (AA/EE/OO stand in for vowel-initial syllables)
- c = [
-     "AA", "EE", "OO", "b", "c", "ch", "d", "f", "g", "h", "j", "k", "l",
-     "m", "n", "p", "q", "r", "s", "sh", "t", "w", "x", "y", "z", "zh",
- ]
- # Chinese finals with tone marks 1-5
- v = [
-     # tone 1
-     "E1", "En1", "a1", "ai1", "an1", "ang1", "ao1", "e1", "ei1", "en1",
-     "eng1", "er1", "i1", "i01", "ia1", "ian1", "iang1", "iao1", "ie1",
-     "in1", "ing1", "iong1", "ir1", "iu1", "o1", "ong1", "ou1", "u1",
-     "ua1", "uai1", "uan1", "uang1", "ui1", "un1", "uo1", "v1", "van1",
-     "ve1", "vn1",
-     # tone 2
-     "E2", "En2", "a2", "ai2", "an2", "ang2", "ao2", "e2", "ei2", "en2",
-     "eng2", "er2", "i2", "i02", "ia2", "ian2", "iang2", "iao2", "ie2",
-     "in2", "ing2", "iong2", "ir2", "iu2", "o2", "ong2", "ou2", "u2",
-     "ua2", "uai2", "uan2", "uang2", "ui2", "un2", "uo2", "v2", "van2",
-     "ve2", "vn2",
-     # tone 3
-     "E3", "En3", "a3", "ai3", "an3", "ang3", "ao3", "e3", "ei3", "en3",
-     "eng3", "er3", "i3", "i03", "ia3", "ian3", "iang3", "iao3", "ie3",
-     "in3", "ing3", "iong3", "ir3", "iu3", "o3", "ong3", "ou3", "u3",
-     "ua3", "uai3", "uan3", "uang3", "ui3", "un3", "uo3", "v3", "van3",
-     "ve3", "vn3",
-     # tone 4
-     "E4", "En4", "a4", "ai4", "an4", "ang4", "ao4", "e4", "ei4", "en4",
-     "eng4", "er4", "i4", "i04", "ia4", "ian4", "iang4", "iao4", "ie4",
-     "in4", "ing4", "iong4", "ir4", "iu4", "o4", "ong4", "ou4", "u4",
-     "ua4", "uai4", "uan4", "uang4", "ui4", "un4", "uo4", "v4", "van4",
-     "ve4", "vn4",
-     # tone 5 (neutral)
-     "E5", "En5", "a5", "ai5", "an5", "ang5", "ao5", "e5", "ei5", "en5",
-     "eng5", "er5", "i5", "i05", "ia5", "ian5", "iang5", "iao5", "ie5",
-     "in5", "ing5", "iong5", "ir5", "iu5", "o5", "ong5", "ou5", "u5",
-     "ua5", "uai5", "uan5", "uang5", "ui5", "un5", "uo5", "v5", "van5",
-     "ve5", "vn5",
- ]
-
- v_without_tone = [
-     "E", "En", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng",
-     "er", "i", "i0", "ia", "ian", "iang", "iao", "ie", "in", "ing",
-     "iong", "ir", "iu", "o", "ong", "ou", "u", "ua", "uai", "uan",
-     "uang", "ui", "un", "uo", "v", "van", "ve", "vn",
- ]
-
- # japanese
- ja_symbols = [
-     "I", "N", "U", "a", "b", "by", "ch", "cl", "d", "dy", "e", "f", "g",
-     "gy", "h", "hy", "i", "j", "k", "ky", "m", "my", "n", "ny", "o", "p",
-     "py", "r", "ry", "s", "sh", "t", "ts", "u", "v", "w", "y", "z",
- ]
-
- # English ARPAbet (same set as in text/english.py)
- arpa = {
-     "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B", "G",
-     "AY0", "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2", "DH", "IY0",
-     "EY1", "IH0", "K", "N", "W", "IY2", "T", "AA1", "ER1", "EH2", "OY0",
-     "UH2", "UW1", "Z", "AW2", "AW1", "V", "UW2", "AA2", "ER", "AW0", "UW0",
-     "R", "OW1", "EH1", "ZH", "AE0", "IH2", "IH", "Y", "JH", "P", "AY1",
-     "EY0", "OY2", "TH", "HH", "D", "ER0", "CH", "AO1", "AE1", "AO2", "OY1",
-     "AY2", "IH1", "OW0", "L", "SH",
- }
-
- symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
- symbols = sorted(set(symbols))
- if __name__ == "__main__":
-     print(len(symbols))
 
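symbols merges the pad symbol, Chinese initials/finals, Japanese phones, punctuation/pause symbols, and the ARPAbet set, then sorts and de-duplicates so symbol IDs are deterministic across runs. The ID table in text/__init__.py is simply:

    from text.symbols import symbols

    _symbol_to_id = {s: i for i, s in enumerate(symbols)}  # stable: symbols is sorted(set(...))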
text/tone_sandhi.py DELETED
@@ -1,805 +0,0 @@
- # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from typing import List
- from typing import Tuple
-
- import jieba_fast as jieba
- from pypinyin import lazy_pinyin
- from pypinyin import Style
-
-
- class ToneSandhi:
-     def __init__(self):
-         # words whose last syllable must take the neutral tone (tone 5)
-         self.must_neural_tone_words = {
-             "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", "馄饨", "风筝",
-             "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", "铃铛", "铁匠", "钥匙", "里脊",
-             "里头", "部分", "那么", "道士", "造化", "迷糊", "连累", "这么", "这个", "运气", "过去",
-             "软和", "转悠", "踏实", "跳蚤", "跟头", "趔趄", "财主", "豆腐", "讲究", "记性", "记号",
-             "认识", "规矩", "见识", "裁缝", "补丁", "衣裳", "衣服", "衙门", "街坊", "行李", "行当",
-             "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", "萝卜", "荸荠", "苗条", "苗头", "苍蝇", "芝麻",
-             "舒服", "舒坦", "舌头", "自在", "膏药", "脾气", "脑袋", "脊梁", "能耐", "胳膊", "胭脂",
-             "胡萝", "胡琴", "胡同", "聪明", "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆",
-             "老头", "老太", "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂",
-             "精神", "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", "笑话", "窟窿",
-             "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", "秀才", "福气", "祖宗", "砚台",
-             "码头", "石榴", "石头", "石匠", "知识", "眼睛", "眯缝", "眨巴", "眉毛", "相声", "盘算",
-             "白净", "痢疾", "痛快", "疟疾", "疙瘩", "疏忽", "畜生", "生意", "甘蔗", "琵琶", "琢磨",
-             "琉璃", "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", "牲口", "牙碜", "牌楼", "爽快",
-             "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", "炊帚", "灯笼", "火候", "漂亮", "滑溜",
-             "溜达", "温和", "清楚", "消息", "浪头", "活泼", "比方", "正经", "欺负", "模糊", "槟榔",
-             "棺材", "棒槌", "棉花", "核桃", "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事",
-             "木头", "木匠", "朋友", "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾",
-             "收成", "提防", "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", "招牌", "招呼",
-             "抬举", "护士", "折腾", "扫帚", "打量", "打算", "打点", "打扮", "打听", "打发", "扎实",
-             "扁担", "戒指", "懒得", "意识", "意思", "情形", "悟性", "怪物", "思量", "怎么", "念头",
-             "念叨", "快活", "忙活", "志气", "心思", "得罪", "张罗", "弟兄", "开通", "应酬", "庄稼",
-             "干事", "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", "差事", "工夫", "岁数",
-             "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", "对付", "寡妇", "家伙", "客气",
-             "实在", "官司", "学问", "学生", "字号", "嫁妆", "媳妇", "媒人", "婆家", "娘家", "委屈",
-             "姑娘", "姐夫", "妯娌", "妥当", "妖精", "奴才", "女婿", "头发", "太阳", "大爷", "大方",
-             "大意", "大夫", "多少", "多么", "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴",
-             "嘱咐", "嘟囔", "嘀咕", "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", "哈欠", "哆嗦",
-             "咳嗽", "和尚", "告诉", "告示", "含糊", "吓唬", "后头", "名字", "名堂", "合同", "吆喝",
-             "叫唤", "口袋", "厚道", "厉害", "千斤", "包袱", "包涵", "匀称", "勤快", "动静", "动弹",
-             "功夫", "力气", "前头", "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", "出息",
-             "凑合", "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", "便宜", "使唤",
-             "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么", "人家", "亲戚", "亲家",
-             "交情", "云彩", "事情", "买卖", "主意", "丫头", "丧气", "两口", "东西", "东家", "世故",
-             "不由", "不在", "下水", "下巴", "上头", "上司", "丈夫", "丈人", "一辈", "那个", "菩萨",
-             "父亲", "母亲", "咕噜", "邋遢", "费用", "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅",
-             "幸福", "熟悉", "计划", "扑腾", "蜡烛", "姥爷", "照顾", "喉咙", "吉他", "弄堂", "蚂蚱",
-             "凤凰", "拖沓", "寒碜", "糟蹋", "倒腾", "报复", "逻辑", "盘缠", "喽啰", "牢骚", "咖喱",
-             "扫把", "惦记",
-         }
-         # words that must NOT take the neutral tone
-         self.must_not_neural_tone_words = {
-             "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎",
-             "幺幺", "干嘛", "学子", "哈哈", "数数", "袅袅", "局地", "以下", "娃哈哈", "花花草草",
-             "留得", "耕地", "想想", "熙熙", "攘攘", "卵子", "死死", "冉冉", "恳恳", "佼佼", "吵吵",
-             "打打", "考考", "整整", "莘莘", "落地", "算子", "家家户户", "青青",
-         }
-         self.punc = ":,;。?!“”‘’':,;.?!"
-
-     # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
-     # e.g.
-     # word: "家里"
-     # pos: "s"
-     # finals: ['ia1', 'i3']
-     def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
-         # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
-         for j, item in enumerate(word):
-             if (
-                 j - 1 >= 0
-                 and item == word[j - 1]
-                 and pos[0] in {"n", "v", "a"}
-                 and word not in self.must_not_neural_tone_words
-             ):
-                 finals[j] = finals[j][:-1] + "5"
-         ge_idx = word.find("个")
-         if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
-             finals[-1] = finals[-1][:-1] + "5"
-         elif len(word) >= 1 and word[-1] in "的地得":
-             finals[-1] = finals[-1][:-1] + "5"
-         # e.g. 走了, 看着, 去过
-         elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
-             finals[-1] = finals[-1][:-1] + "5"
-         elif (
-             len(word) > 1
-             and word[-1] in "们子"
-             and pos in {"r", "n"}
-             and word not in self.must_not_neural_tone_words
-         ):
-             finals[-1] = finals[-1][:-1] + "5"
-         # e.g. 桌上, 地下, 家里
-         elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
-             finals[-1] = finals[-1][:-1] + "5"
-         # e.g. 上来, 下去
-         elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
-             finals[-1] = finals[-1][:-1] + "5"
-         # 个 used as a measure word
-         elif (
-             ge_idx >= 1
-             and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
-         ) or word == "个":
-             finals[ge_idx] = finals[ge_idx][:-1] + "5"
-         else:
-             if (
-                 word in self.must_neural_tone_words
-                 or word[-2:] in self.must_neural_tone_words
-             ):
-                 finals[-1] = finals[-1][:-1] + "5"
-
-         word_list = self._split_word(word)
-         finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
-         for i, word in enumerate(word_list):
-             # conventional neutral tone in Chinese
-             if (
-                 word in self.must_neural_tone_words
-                 or word[-2:] in self.must_neural_tone_words
-             ):
-                 finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
-         finals = sum(finals_list, [])
-         return finals
-
-     def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
-         # e.g. 看不懂
-         if len(word) == 3 and word[1] == "不":
-             finals[1] = finals[1][:-1] + "5"
-         else:
-             for i, char in enumerate(word):
-                 # "不" before tone4 should be bu2, e.g. 不怕
-                 if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
-                     finals[i] = finals[i][:-1] + "2"
-         return finals
-
-     def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
-         # "一" in number sequences, e.g. 一零零, 二一零
-         if word.find("一") != -1 and all(
-             [item.isnumeric() for item in word if item != "一"]
-         ):
-             return finals
-         # "一" between reduplication words should be yi5, e.g. 看一看
-         elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
-             finals[1] = finals[1][:-1] + "5"
-         # when "一" is an ordinal word, it should be yi1
-         elif word.startswith("第一"):
-             finals[1] = finals[1][:-1] + "1"
-         else:
-             for i, char in enumerate(word):
-                 if char == "一" and i + 1 < len(word):
-                     # "一" before tone4 should be yi2, e.g. 一段
-                     if finals[i + 1][-1] == "4":
-                         finals[i] = finals[i][:-1] + "2"
-                     # "一" before non-tone4 should be yi4, e.g. 一天
-                     else:
-                         # if "一" is followed by punctuation, it keeps tone 1
-                         if word[i + 1] not in self.punc:
-                             finals[i] = finals[i][:-1] + "4"
-         return finals
-
-     def _split_word(self, word: str) -> List[str]:
-         word_list = jieba.cut_for_search(word)
-         word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
-         first_subword = word_list[0]
-         first_begin_idx = word.find(first_subword)
-         if first_begin_idx == 0:
-             second_subword = word[len(first_subword) :]
-             new_word_list = [first_subword, second_subword]
-         else:
-             second_subword = word[: -len(first_subword)]
-             new_word_list = [second_subword, first_subword]
-         return new_word_list
-
-     def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
-         if len(word) == 2 and self._all_tone_three(finals):
-             finals[0] = finals[0][:-1] + "2"
-         elif len(word) == 3:
-             word_list = self._split_word(word)
-             if self._all_tone_three(finals):
-                 # disyllabic + monosyllabic, e.g. 蒙古/包
-                 if len(word_list[0]) == 2:
-                     finals[0] = finals[0][:-1] + "2"
-                     finals[1] = finals[1][:-1] + "2"
-                 # monosyllabic + disyllabic, e.g. 纸/老虎
-                 elif len(word_list[0]) == 1:
-                     finals[1] = finals[1][:-1] + "2"
-             else:
-                 finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
-                 if len(finals_list) == 2:
-                     for i, sub in enumerate(finals_list):
-                         # e.g. 所有/人
-                         if self._all_tone_three(sub) and len(sub) == 2:
-                             finals_list[i][0] = finals_list[i][0][:-1] + "2"
-                         # e.g. 好/喜欢
-                         elif (
-                             i == 1
-                             and not self._all_tone_three(sub)
-                             and finals_list[i][0][-1] == "3"
-                             and finals_list[0][-1][-1] == "3"
-                         ):
-                             finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
-                     finals = sum(finals_list, [])
-         # split an idiom into two 2-character words
-         elif len(word) == 4:
-             finals_list = [finals[:2], finals[2:]]
-             finals = []
-             for sub in finals_list:
-                 if self._all_tone_three(sub):
-                     sub[0] = sub[0][:-1] + "2"
-                 finals += sub
-
-         return finals
-
-     def _all_tone_three(self, finals: List[str]) -> bool:
-         return all(x[-1] == "3" for x in finals)
-
-     # merge "不" and the word behind it
-     # if not merged, "不" sometimes appears alone after jieba segmentation, which can cause sandhi errors
-     def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
-         new_seg = []
-         last_word = ""
-         for word, pos in seg:
-             if last_word == "不":
-                 word = last_word + word
-             if word != "不":
-                 new_seg.append((word, pos))
-             last_word = word[:]
-         if last_word == "不":
-             new_seg.append((last_word, "d"))
-             last_word = ""
-         return new_seg
-
-     # function 1: merge "一" with the reduplicated words on its left and right, e.g. "听","一","听" -> "听一听"
-     # function 2: merge a single "一" with the word behind it
-     # if not merged, "一" sometimes appears alone after jieba segmentation, which can cause sandhi errors
-     # e.g.
-     # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
-     # output seg: [['听一听', 'v']]
-     def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
-         new_seg = []
-         # function 1
-         for i, (word, pos) in enumerate(seg):
-             if (
-                 i - 1 >= 0
-                 and word == "一"
-                 and i + 1 < len(seg)
-                 and seg[i - 1][0] == seg[i + 1][0]
-                 and seg[i - 1][1] == "v"
-             ):
-                 new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
-             else:
-                 if (
-                     i - 2 >= 0
-                     and seg[i - 1][0] == "一"
-                     and seg[i - 2][0] == word
-                     and pos == "v"
-                 ):
-                     continue
-                 else:
-                     new_seg.append([word, pos])
-         seg = new_seg
-         new_seg = []
-         # function 2
-         for i, (word, pos) in enumerate(seg):
-             if new_seg and new_seg[-1][0] == "一":
-                 new_seg[-1][0] = new_seg[-1][0] + word
-             else:
-                 new_seg.append([word, pos])
-         return new_seg
-
-     # the first and the second words are all_tone_three
-     def _merge_continuous_three_tones(
-         self, seg: List[Tuple[str, str]]
-     ) -> List[Tuple[str, str]]:
-         new_seg = []
-         sub_finals_list = [
-             lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
-             for (word, pos) in seg
-         ]
-         assert len(sub_finals_list) == len(seg)
-         merge_last = [False] * len(seg)
-         for i, (word, pos) in enumerate(seg):
-             if (
-                 i - 1 >= 0
-                 and self._all_tone_three(sub_finals_list[i - 1])
-                 and self._all_tone_three(sub_finals_list[i])
-                 and not merge_last[i - 1]
-             ):
-                 # if the last word is a reduplication, don't merge: reduplication needs _neural_sandhi
-                 if (
-                     not self._is_reduplication(seg[i - 1][0])
-                     and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
-                 ):
-                     new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
-                     merge_last[i] = True
-                 else:
-                     new_seg.append([word, pos])
-             else:
-                 new_seg.append([word, pos])
-
-         return new_seg
-
-     def _is_reduplication(self, word: str) -> bool:
-         return len(word) == 2 and word[0] == word[1]
-
-     # the last char of the first word and the first char of the second word are tone three
-     def _merge_continuous_three_tones_2(
-         self, seg: List[Tuple[str, str]]
-     ) -> List[Tuple[str, str]]:
-         new_seg = []
-         sub_finals_list = [
-             lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
-             for (word, pos) in seg
-         ]
-         assert len(sub_finals_list) == len(seg)
-         merge_last = [False] * len(seg)
-         for i, (word, pos) in enumerate(seg):
-             if (
-                 i - 1 >= 0
-                 and sub_finals_list[i - 1][-1][-1] == "3"
-                 and sub_finals_list[i][0][-1] == "3"
-                 and not merge_last[i - 1]
-             ):
-                 # if the last word is a reduplication, don't merge: reduplication needs _neural_sandhi
-                 if (
-                     not self._is_reduplication(seg[i - 1][0])
-                     and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
-                 ):
-                     new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
-                     merge_last[i] = True
-                 else:
-                     new_seg.append([word, pos])
-             else:
-                 new_seg.append([word, pos])
-         return new_seg
-
-     def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
-         new_seg = []
-         for i, (word, pos) in enumerate(seg):
-             if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
-                 new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
-             else:
-                 new_seg.append([word, pos])
-         return new_seg
-
-     def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
-         new_seg = []
-         for i, (word, pos) in enumerate(seg):
-             if new_seg and word == new_seg[-1][0]:
-                 new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
-             else:
-                 new_seg.append([word, pos])
-         return new_seg
-
-     def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
-         seg = self._merge_bu(seg)
-         try:
-             seg = self._merge_yi(seg)
-         except Exception:
-             print("_merge_yi failed")
-         seg = self._merge_reduplication(seg)
-         try:
-             seg = self._merge_continuous_three_tones(seg)
-         except Exception:
-             print("_merge_continuous_three_tones failed")
-         try:
-             seg = self._merge_continuous_three_tones_2(seg)
-         except Exception:
-             print("_merge_continuous_three_tones_2 failed")
-
-         seg = self._merge_er(seg)
-         return seg
-
-     def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
-         finals = self._bu_sandhi(word, finals)
-         finals = self._yi_sandhi(word, finals)
-         finals = self._neural_sandhi(word, pos, finals)
-         finals = self._three_sandhi(word, finals)
-         return finals
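ToneSandhi is used by text/chinese.py in two passes: pre_merge_for_modify re-merges jieba segments so the sandhi rules see whole words, then modified_tone rewrites the pypinyin finals in place. A small sketch (the third-tone example was verified against the rules above; pos tags come from jieba_fast.posseg):

    import jieba_fast.posseg as psg
    from text.tone_sandhi import ToneSandhi

    ts = ToneSandhi()
    seg = ts.pre_merge_for_modify(psg.lcut("你好"))         # roughly [("你好", "l")]
    finals = ts.modified_tone("你好", "l", ["i3", "ao3"])   # ['i2', 'ao3']: 3-3 -> 2-3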