AkitoP committed
Commit 2372084
1 Parent(s): adf6347

Update GPT_SoVITS/text/japanese.py

Files changed (1):
  1. GPT_SoVITS/text/japanese.py +223 -227

GPT_SoVITS/text/japanese.py CHANGED
@@ -1,227 +1,223 @@
- # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
- import re
-
- import pyopenjtalk
- import os
- import hashlib
- from text.symbols2 import symbols
- current_file_path = os.path.dirname(__file__)
- def get_hash(fp: str) -> str:
-     hash_md5 = hashlib.md5()
-     with open(fp, "rb") as f:
-         for chunk in iter(lambda: f.read(4096), b""):
-             hash_md5.update(chunk)
-     return hash_md5.hexdigest()
-
- USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
- USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
- USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
- # If the user dictionary is missing, build it; if it exists, check its md5 and rebuild when it differs
- if os.path.exists(USERDIC_CSV_PATH):
-     print("userdict exists")
-     if not os.path.exists(USERDIC_BIN_PATH) or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r", encoding='utf-8').read():
-         pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
-         with open(USERDIC_HASH_PATH, "w", encoding='utf-8') as f:
-             f.write(get_hash(USERDIC_CSV_PATH))
-
- if os.path.exists(USERDIC_BIN_PATH):
-     pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
-
-
- from text.symbols import punctuation
- # Regular expression matching Japanese without punctuation marks:
- _japanese_characters = re.compile(
-     r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
- )
-
- # Regular expression matching non-Japanese characters or punctuation marks:
- _japanese_marks = re.compile(
-     r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
- )
-
- # List of (symbol, Japanese) pairs for marks:
- _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
-
-
- # List of (consonant, sokuon) pairs:
- _real_sokuon = [
-     (re.compile("%s" % x[0]), x[1])
-     for x in [
-         (r"Q([↑↓]*[kg])", r"k#\1"),
-         (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
-         (r"Q([↑↓]*[sʃ])", r"s\1"),
-         (r"Q([↑↓]*[pb])", r"p#\1"),
-     ]
- ]
-
- # List of (consonant, hatsuon) pairs:
- _real_hatsuon = [
-     (re.compile("%s" % x[0]), x[1])
-     for x in [
-         (r"N([↑↓]*[pbm])", r"m\1"),
-         (r"N([↑↓]*[ʧʥj])", r"n^\1"),
-         (r"N([↑↓]*[tdn])", r"n\1"),
-         (r"N([↑↓]*[kg])", r"ŋ\1"),
-     ]
- ]
-
-
- def post_replace_ph(ph):
-     rep_map = {
-         "：": ",",
-         "；": ",",
-         "，": ",",
-         "。": ".",
-         "！": "!",
-         "？": "?",
-         "\n": ".",
-         "·": ",",
-         "、": ",",
-         "...": "…",
-     }
-
-     if ph in rep_map.keys():
-         ph = rep_map[ph]
-     if ph in symbols:
-         return ph
-     if ph not in symbols:
-         ph = "UNK"
-     return ph
-
-
- def replace_consecutive_punctuation(text):
-     punctuations = ''.join(re.escape(p) for p in punctuation)
-     pattern = f'([{punctuations}])([{punctuations}])+'
-     result = re.sub(pattern, r'\1', text)
-     return result
-
-
- def symbols_to_japanese(text):
-     for regex, replacement in _symbols_to_japanese:
-         text = re.sub(regex, replacement, text)
-     return text
-
-
- def preprocess_jap(text, with_prosody=False):
-     """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
-     text = symbols_to_japanese(text)
-     sentences = re.split(_japanese_marks, text)
-     marks = re.findall(_japanese_marks, text)
-     text = []
-     for i, sentence in enumerate(sentences):
-         if re.match(_japanese_characters, sentence):
-             if with_prosody:
-                 text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
-             else:
-                 p = pyopenjtalk.g2p(sentence)
-                 text += p.split(" ")
-
-         if i < len(marks):
-             if marks[i] == " ":  # prevent an unexpected UNK
-                 continue
-             text += [marks[i].replace(" ", "")]
-     return text
-
-
- def text_normalize(text):
-     # todo: jap text normalize
-
-     # avoid reference leakage caused by repeated punctuation
-     text = replace_consecutive_punctuation(text)
-     text = "".join([i.lower() for i in text])
-     return text
-
- # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
- def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
-     """Extract phoneme + prosody symbol sequence from input full-context labels.
-
-     The algorithm is based on `Prosodic features control by symbols as input of
-     sequence-to-sequence acoustic modeling for neural TTS`_ with some of r9y9's tweaks.
-
-     Args:
-         text (str): Input text.
-         drop_unvoiced_vowels (bool): Whether to drop unvoiced vowels.
-
-     Returns:
-         List[str]: List of phoneme + prosody symbols.
-
-     Examples:
-         >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
-         >>> pyopenjtalk_g2p_prosody("こんにちは。")
-         ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
-
-     .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
-         modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
-
-     """
-     labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
-     N = len(labels)
-
-     phones = []
-     for n in range(N):
-         lab_curr = labels[n]
-
-         # current phoneme
-         p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
-         # treat unvoiced vowels as normal vowels
-         if drop_unvoiced_vowels and p3 in "AEIOU":
-             p3 = p3.lower()
-
-         # deal with sil at the beginning and the end of text
-         if p3 == "sil":
-             assert n == 0 or n == N - 1
-             if n == 0:
-                 phones.append("^")
-             elif n == N - 1:
-                 # check question form or not
-                 e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
-                 if e3 == 0:
-                     phones.append("$")
-                 elif e3 == 1:
-                     phones.append("?")
-             continue
-         elif p3 == "pau":
-             phones.append("_")
-             continue
-         else:
-             phones.append(p3)
-
-         # accent type and position info (forward or backward)
-         a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
-         a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
-         a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
-
-         # number of mora in accent phrase
-         f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
-
-         a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
-         # accent phrase border
-         if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
-             phones.append("#")
-         # pitch falling
-         elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
-             phones.append("]")
-         # pitch rising
-         elif a2 == 1 and a2_next == 2:
-             phones.append("[")
-
-     return phones
-
- # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
- def _numeric_feature_by_regex(regex, s):
-     match = re.search(regex, s)
-     if match is None:
-         return -50
-     return int(match.group(1))
-
- def g2p(norm_text, with_prosody=True):
-     norm_text = text_normalize(norm_text)
-     phones = preprocess_jap(norm_text, with_prosody)
-     phones = [post_replace_ph(i) for i in phones]
-     # todo: implement tones and word2ph
-     return phones
-
-
- if __name__ == "__main__":
-     phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!")
-     print(phones)
 
+ # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
+ import re
+
+ import pyopenjtalk
+ import os
+ import hashlib
+ from text.symbols2 import symbols
+ current_file_path = os.path.dirname(__file__)
+ def get_hash(fp: str) -> str:
+     hash_md5 = hashlib.md5()
+     with open(fp, "rb") as f:
+         for chunk in iter(lambda: f.read(4096), b""):
+             hash_md5.update(chunk)
+     return hash_md5.hexdigest()
+
+ USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
+ USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
+ USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
+ # If the user dictionary is missing, build it; if it exists, check its md5 and rebuild when it differs
+ try:
+     if os.path.exists(USERDIC_BIN_PATH):
+         pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
+ except:
+     print("FAIL TO USE USERDICT")
+
+
+ from text.symbols import punctuation
+ # Regular expression matching Japanese without punctuation marks:
+ _japanese_characters = re.compile(
+     r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
+ )
+
+ # Regular expression matching non-Japanese characters or punctuation marks:
+ _japanese_marks = re.compile(
+     r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
+ )
+
+ # List of (symbol, Japanese) pairs for marks:
+ _symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("%", "パーセント")]]
+
+
+ # List of (consonant, sokuon) pairs:
+ _real_sokuon = [
+     (re.compile("%s" % x[0]), x[1])
+     for x in [
+         (r"Q([↑↓]*[kg])", r"k#\1"),
+         (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
+         (r"Q([↑↓]*[sʃ])", r"s\1"),
+         (r"Q([↑↓]*[pb])", r"p#\1"),
+     ]
+ ]
+
+ # List of (consonant, hatsuon) pairs:
+ _real_hatsuon = [
+     (re.compile("%s" % x[0]), x[1])
+     for x in [
+         (r"N([↑↓]*[pbm])", r"m\1"),
+         (r"N([↑↓]*[ʧʥj])", r"n^\1"),
+         (r"N([↑↓]*[tdn])", r"n\1"),
+         (r"N([↑↓]*[kg])", r"ŋ\1"),
+     ]
+ ]
+
+
+ def post_replace_ph(ph):
+     rep_map = {
+         "：": ",",
+         "；": ",",
+         "，": ",",
+         "。": ".",
+         "！": "!",
+         "？": "?",
+         "\n": ".",
+         "·": ",",
+         "、": ",",
+         "...": "…",
+     }
+
+     if ph in rep_map.keys():
+         ph = rep_map[ph]
+     if ph in symbols:
+         return ph
+     if ph not in symbols:
+         ph = "UNK"
+     return ph
+
+
+ def replace_consecutive_punctuation(text):
+     punctuations = ''.join(re.escape(p) for p in punctuation)
+     pattern = f'([{punctuations}])([{punctuations}])+'
+     result = re.sub(pattern, r'\1', text)
+     return result
+
+
+ def symbols_to_japanese(text):
+     for regex, replacement in _symbols_to_japanese:
+         text = re.sub(regex, replacement, text)
+     return text
+
+
+ def preprocess_jap(text, with_prosody=False):
+     """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
+     text = symbols_to_japanese(text)
+     sentences = re.split(_japanese_marks, text)
+     marks = re.findall(_japanese_marks, text)
+     text = []
+     for i, sentence in enumerate(sentences):
+         if re.match(_japanese_characters, sentence):
+             if with_prosody:
+                 text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
+             else:
+                 p = pyopenjtalk.g2p(sentence)
+                 text += p.split(" ")
+
+         if i < len(marks):
+             if marks[i] == " ":  # prevent an unexpected UNK
+                 continue
+             text += [marks[i].replace(" ", "")]
+     return text
+
+
+ def text_normalize(text):
+     # todo: jap text normalize
+
+     # avoid reference leakage caused by repeated punctuation
+     text = replace_consecutive_punctuation(text)
+     text = "".join([i.lower() for i in text])
+     return text
+
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
+     """Extract phoneme + prosody symbol sequence from input full-context labels.
+
+     The algorithm is based on `Prosodic features control by symbols as input of
+     sequence-to-sequence acoustic modeling for neural TTS`_ with some of r9y9's tweaks.
+
+     Args:
+         text (str): Input text.
+         drop_unvoiced_vowels (bool): Whether to drop unvoiced vowels.
+
+     Returns:
+         List[str]: List of phoneme + prosody symbols.
+
+     Examples:
+         >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
+         >>> pyopenjtalk_g2p_prosody("こんにちは。")
+         ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
+
+     .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
+         modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
+
+     """
+     labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
+     N = len(labels)
+
+     phones = []
+     for n in range(N):
+         lab_curr = labels[n]
+
+         # current phoneme
+         p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
+         # treat unvoiced vowels as normal vowels
+         if drop_unvoiced_vowels and p3 in "AEIOU":
+             p3 = p3.lower()
+
+         # deal with sil at the beginning and the end of text
+         if p3 == "sil":
+             assert n == 0 or n == N - 1
+             if n == 0:
+                 phones.append("^")
+             elif n == N - 1:
+                 # check question form or not
+                 e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
+                 if e3 == 0:
+                     phones.append("$")
+                 elif e3 == 1:
+                     phones.append("?")
+             continue
+         elif p3 == "pau":
+             phones.append("_")
+             continue
+         else:
+             phones.append(p3)
+
+         # accent type and position info (forward or backward)
+         a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
+         a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
+         a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
+
+         # number of mora in accent phrase
+         f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
+
+         a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
+         # accent phrase border
+         if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
+             phones.append("#")
+         # pitch falling
+         elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
+             phones.append("]")
+         # pitch rising
+         elif a2 == 1 and a2_next == 2:
+             phones.append("[")
+
+     return phones
+
+ # Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+ def _numeric_feature_by_regex(regex, s):
+     match = re.search(regex, s)
+     if match is None:
+         return -50
+     return int(match.group(1))
+
+ def g2p(norm_text, with_prosody=True):
+     norm_text = text_normalize(norm_text)
+     phones = preprocess_jap(norm_text, with_prosody)
+     phones = [post_replace_ph(i) for i in phones]
+     # todo: implement tones and word2ph
+     return phones
+
+
+ if __name__ == "__main__":
+     phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!")
+     print(phones)
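
The deleted lines are an import-time compile cache: rebuild user.dict from userdict.csv whenever the CSV's md5 no longer matches the cached one. A minimal, self-contained sketch of that pattern; the paths and the compile_fn callback here are hypothetical stand-ins:

import hashlib
import os

def file_md5(path: str) -> str:
    # stream the file so a large dictionary is never loaded at once
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            h.update(chunk)
    return h.hexdigest()

def ensure_compiled(csv_path: str, bin_path: str, md5_path: str, compile_fn) -> None:
    # recompile bin_path from csv_path when the cached md5 is stale or missing
    if not os.path.exists(csv_path):
        return
    cached = ""
    if os.path.exists(md5_path):
        with open(md5_path, "r", encoding="utf-8") as f:
            cached = f.read()
    if not os.path.exists(bin_path) or file_md5(csv_path) != cached:
        compile_fn(csv_path, bin_path)  # e.g. pyopenjtalk.mecab_dict_index
        with open(md5_path, "w", encoding="utf-8") as f:
            f.write(file_md5(csv_path))

One difference from the removed code: the original opened userdict.md5 unconditionally once user.dict existed, so a missing md5 file raised FileNotFoundError at import time. The sketch guards that read; the committed version instead sidesteps the whole failure class by wrapping dictionary loading in try/except.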
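_real_sokuon and _real_hatsuon are built but never applied anywhere in this file. If they were, the usual pattern is to run each (regex, replacement) pair over the phoneme string, rewriting the sokuon placeholder Q and the moraic-nasal placeholder N according to the following consonant. A sketch assuming the module's tables are in scope; apply_pairs is a hypothetical helper:

def apply_pairs(pairs, text):
    # run every (compiled regex, replacement) pair over the string in order
    for regex, replacement in pairs:
        text = re.sub(regex, replacement, text)
    return text

print(apply_pairs(_real_sokuon, "iQta"))     # 'it#ta'  (Q geminates before t)
print(apply_pairs(_real_hatsuon, "kaNpai"))  # 'kampai' (N -> m before p)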
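The prosody symbols come from numeric fields inside pyopenjtalk's HTS full-context labels: a1/a2/a3 from the /A: block and f1 (morae in the accent phrase) from the /F: block, with _numeric_feature_by_regex returning -50 as a sentinel when a field is absent. A demonstration on a made-up label fragment shaped like those fields (not real pyopenjtalk output):

lab = "xx^xx-o+N=xx/A:-3+2+5/F:7_xx"  # hypothetical fragment
print(_numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab))  # -3  (a1)
print(_numeric_feature_by_regex(r"\+(\d+)\+", lab))        # 2   (a2)
print(_numeric_feature_by_regex(r"\+(\d+)/", lab))         # 5   (a3)
print(_numeric_feature_by_regex(r"/F:(\d+)_", lab))        # 7   (f1)
print(_numeric_feature_by_regex(r"!(\d+)_", lab))          # -50 (field absent)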
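End to end, g2p() normalizes the text, runs pyopenjtalk_g2p_prosody per sentence (dropping the leading '^' and trailing '$'/'?' via the [1:-1] slice in preprocess_jap), then maps punctuation through post_replace_ph. Given the docstring's example for 「こんにちは。」, the wrapper should behave roughly like this; exact phones depend on the installed pyopenjtalk dictionary:

phones = g2p("こんにちは。")
print(phones)
# expected from the docstring example: ['k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '.']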