wetdog committed on
Commit
2dd2041
·
1 Parent(s): 5b51c67

multiaccent inference, cleaners and models

Browse files
Files changed (4) hide show
  1. Dockerfile +8 -5
  2. infer_onnx.py +59 -33
  3. spk_to_id_2.json +18 -0
  4. text/cleaners.py +36 -12
Dockerfile CHANGED
@@ -40,11 +40,14 @@ COPY --chown=user requirements.txt $HOME/app/
40
 
41
  RUN pip install -r requirements.txt
42
 
43
- # download from inference script
44
- #RUN huggingface-cli download BSC-LT/matcha-tts-cat-multispeaker matcha_multispeaker_cat_opset_15_10_steps_2399.onnx --local-dir $HOME/app/
45
- #RUN huggingface-cli download BSC-LT/matcha-tts-cat-multispeaker config.yaml --local-dir $HOME/app/
46
- #RUN huggingface-cli download BSC-LT/vocos-mel-22khz-cat mel_spec_22khz_cat.onnx --local-dir $HOME/app/
47
- #RUN huggingface-cli download BSC-LT/vocos-mel-22khz-cat config.yaml --local-dir $HOME/app/
 
 
 
48
 
49
  COPY --chown=user . $HOME/app/
50
 
 
40
 
41
  RUN pip install -r requirements.txt
42
 
43
+ # download from hf hub
44
+ RUN huggingface-cli download BSC-LT/matcha-tts-cat-multiaccent matcha_multispeaker_cat_bal_opset_15_10_steps.onnx --local-dir $HOME/app/
45
+ RUN huggingface-cli download BSC-LT/matcha-tts-cat-multiaccent matcha_multispeaker_cat_occ_opset_15_10_steps.onnx --local-dir $HOME/app/
46
+ RUN huggingface-cli download BSC-LT/matcha-tts-cat-multiaccent matcha_multispeaker_cat_val_opset_15_10_steps.onnx --local-dir $HOME/app/
47
+ RUN huggingface-cli download BSC-LT/matcha-tts-cat-multiaccent config.yaml --local-dir $HOME/app/
48
+
49
+ RUN huggingface-cli download BSC-LT/vocos-mel-22khz-cat mel_spec_22khz_cat.onnx --local-dir $HOME/app/
50
+ RUN huggingface-cli download BSC-LT/vocos-mel-22khz-cat config.yaml --local-dir $HOME/app/
51
 
52
  COPY --chown=user . $HOME/app/
53
 
infer_onnx.py CHANGED
@@ -10,10 +10,11 @@ import yaml
10
  import json
11
  import os
12
 
13
- from huggingface_hub import hf_hub_download
14
  from time import perf_counter
 
15
 
16
- DEFAULT_SPEAKER_ID = os.environ.get("DEFAULT_SPEAKER_ID", default="caf_08106")
 
17
 
18
  def intersperse(lst, item):
19
  result = [item] * (len(lst) * 2 + 1)
@@ -21,10 +22,10 @@ def intersperse(lst, item):
21
  return result
22
 
23
 
24
- def process_text(i: int, text: str, device: torch.device):
25
  print(f"[{i}] - Input text: {text}")
26
  x = torch.tensor(
27
- intersperse(text_to_sequence(text, ["catalan_cleaners"]), 0),
28
  dtype=torch.long,
29
  device=device,
30
  )[None]
@@ -33,20 +34,36 @@ def process_text(i: int, text: str, device: torch.device):
33
  print(x_phones)
34
  return x.numpy(), x_lengths.numpy()
35
 
36
- MODEL_PATH_MATCHA_MEL=hf_hub_download(repo_id="BSC-LT/matcha-tts-cat-multispeaker", filename="matcha_multispeaker_cat_opset_15_10_steps_2399.onnx")
37
- MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
38
- MODEL_PATH_VOCOS=hf_hub_download(repo_id="BSC-LT/vocos-mel-22khz-cat", filename="mel_spec_22khz_cat.onnx")
39
- CONFIG_PATH=hf_hub_download(repo_id="BSC-LT/vocos-mel-22khz-cat", filename="config.yaml")
40
- SPEAKER_ID_DICT="spk_to_id.json"
 
 
 
41
 
 
42
  sess_options = onnxruntime.SessionOptions()
43
- model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
 
 
 
44
  model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
45
- #model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
46
 
47
  speaker_id_dict = json.load(open(SPEAKER_ID_DICT))
48
- speakers = [sp for sp in speaker_id_dict.keys()]
49
- speakers.sort()
 
 
 
 
 
 
 
 
 
 
50
 
51
  def vocos_inference(mel,denoise):
52
 
@@ -131,10 +148,11 @@ def vocos_inference(mel,denoise):
131
  return y
132
 
133
 
134
- def tts(text:str, spk_name:str, temperature:float, length_scale:float, denoise:bool):
135
- spk_id = speaker_id_dict[spk_name]
136
  sid = np.array([int(spk_id)]) if spk_id is not None else None
137
- text_matcha , text_lengths = process_text(0,text,"cpu")
 
138
 
139
  # MATCHA VOCOS
140
  inputs = {
@@ -158,15 +176,6 @@ def tts(text:str, spk_name:str, temperature:float, length_scale:float, denoise:b
158
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha_vocos:
159
  sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
160
 
161
- #MATCHA HIFIGAN
162
-
163
- inputs = {
164
- "x": text_matcha,
165
- "x_lengths": text_lengths,
166
- "scales": np.array([temperature, length_scale], dtype=np.float32),
167
- "spks": sid
168
- }
169
- hifigan_t0 = perf_counter()
170
  print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs_vocos.shape[1]/22050) }")
171
  return fp_matcha_vocos.name
172
 
@@ -197,7 +206,27 @@ with open("about.md", "r", encoding="utf-8") as f:
197
 
198
  article = "Training and demo by The Language Technologies Unit from Barcelona Supercomputing Center."
199
 
200
- vits2_inference = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  fn=tts,
202
  inputs=[
203
  gr.Textbox(
@@ -205,12 +234,8 @@ vits2_inference = gr.Interface(
205
  max_lines=1,
206
  label="Input text",
207
  ),
208
- gr.Dropdown(
209
- choices=speakers,
210
- label="Speaker id",
211
- value=DEFAULT_SPEAKER_ID,
212
- info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids."
213
- ),
214
  gr.Slider(
215
  0.1,
216
  2.0,
@@ -239,7 +264,8 @@ demo = gr.Blocks()
239
  with demo:
240
  gr.Markdown(title)
241
  gr.Markdown(description)
242
- gr.TabbedInterface([vits2_inference, about_article], ["Demo", "About"])
 
243
  gr.Markdown(article)
244
 
245
  demo.queue(max_size=10)
 
10
  import json
11
  import os
12
 
 
13
  from time import perf_counter
14
+ import random
15
 
16
+ DEFAULT_SPEAKER_ID = os.environ.get("DEFAULT_SPEAKER_ID", default="quim")
17
+ DEFAULT_ACCENT= os.environ.get("DEFAULT_ACCENT", default="balear")
18
 
19
  def intersperse(lst, item):
20
  result = [item] * (len(lst) * 2 + 1)
 
22
  return result
23
 
24
 
25
+ def process_text(i: int, text: str, device: torch.device, cleaner:str):
26
  print(f"[{i}] - Input text: {text}")
27
  x = torch.tensor(
28
+ intersperse(text_to_sequence(text, [cleaner]), 0),
29
  dtype=torch.long,
30
  device=device,
31
  )[None]
 
34
  print(x_phones)
35
  return x.numpy(), x_lengths.numpy()
36
 
37
+ # paths
38
+ MODEL_PATH_MATCHA_MEL_BAL="matcha_multispeaker_cat_bal_opset_15_10_steps.onnx"
39
+ #MODEL_PATH_MATCHA_MEL_CAT=hf_hub_download(repo_id="BSC-LT/matcha-tts-cat-multispeaker", filename="matcha_multispeaker_cat_opset_15_10_steps_2399.onnx")
40
+ MODEL_PATH_MATCHA_MEL_OCC="matcha_multispeaker_cat_occ_opset_15_10_steps.onnx"
41
+ MODEL_PATH_MATCHA_MEL_VAL="matcha_multispeaker_cat_val_opset_15_10_steps.onnx"
42
+ MODEL_PATH_VOCOS="mel_spec_22khz_cat.onnx"
43
+ CONFIG_PATH="config.yaml"
44
+ SPEAKER_ID_DICT="spk_to_id_2.json"
45
 
46
+ # Load models
47
  sess_options = onnxruntime.SessionOptions()
48
+ model_matcha_mel_bal = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL_BAL), sess_options=sess_options, providers=["CPUExecutionProvider"])
49
+ #model_matcha_mel_cat = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL_CAT), sess_options=sess_options, providers=["CPUExecutionProvider"])
50
+ model_matcha_mel_occ = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL_OCC), sess_options=sess_options, providers=["CPUExecutionProvider"])
51
+ model_matcha_mel_val = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL_VAL), sess_options=sess_options, providers=["CPUExecutionProvider"])
52
  model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
 
53
 
54
  speaker_id_dict = json.load(open(SPEAKER_ID_DICT))
55
+ accents = [e for e in speaker_id_dict.keys()]  # NOTE(review): spk_to_id_2.json also lists "central", which has no entry in `models`/`cleaners` below - selecting it would raise KeyError in tts(); confirm intended
56
+
57
+ models={"balear":model_matcha_mel_bal,
58
+ "occidental": model_matcha_mel_occ,
59
+ "valencia": model_matcha_mel_val}
60
+
61
+ cleaners={"balear": "catalan_balear_cleaners",
62
+ "occidental": "catalan_occidental_cleaners",
63
+ "valencia": "catalan_valencia_cleaners"}
64
+
65
+
66
+ speakers = [sp for sp in speaker_id_dict[DEFAULT_ACCENT].keys()]
67
 
68
  def vocos_inference(mel,denoise):
69
 
 
148
  return y
149
 
150
 
151
+ def tts(text:str, accent:str, spk_name:str, temperature:float, length_scale:float, denoise:bool):
152
+ spk_id = speaker_id_dict[accent][spk_name]
153
  sid = np.array([int(spk_id)]) if spk_id is not None else None
154
+ text_matcha , text_lengths = process_text(0,text,"cpu",cleaner=cleaners[accent])
155
+ model_matcha_mel = models[accent]
156
 
157
  # MATCHA VOCOS
158
  inputs = {
 
176
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as fp_matcha_vocos:
177
  sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
178
 
 
 
 
 
 
 
 
 
 
179
  print(f"RTF matcha + vocos { (mel_infer_secs + vocos_infer_secs) / (wavs_vocos.shape[1]/22050) }")
180
  return fp_matcha_vocos.name
181
 
 
206
 
207
  article = "Training and demo by The Language Technologies Unit from Barcelona Supercomputing Center."
208
 
209
+
210
+ def rs_change(accent):  # Callback wired to accent_dropdown.select: returns a speaker Dropdown for the chosen accent
211
+ rnd_idx = random.randint(0, 1)  # picks index 0 or 1; NOTE(review): assumes exactly two speakers per accent, as in spk_to_id_2.json - confirm if speakers are added
212
+ return gr.Dropdown(choices=speaker_id_dict[accent], interactive=True,value=list(speaker_id_dict[accent].keys())[rnd_idx])
213
+
214
+ accent_dropdown = gr.Dropdown(
215
+ choices=accents,
216
+ label="Accent",
217
+ value=DEFAULT_ACCENT,
218
+ info=f"Models are trained on 4 accents"
219
+ )
220
+
221
+ speaker_dropdown = gr.Dropdown(
222
+ choices=speaker_id_dict[DEFAULT_ACCENT],
223
+ label="Speaker id",
224
+ value=DEFAULT_SPEAKER_ID,
225
+ info=f"Models are trained on 2 speakers. You can prompt the model using one of these speaker ids.",
226
+ interactive=True
227
+ )
228
+
229
+ matcha_inference = gr.Interface(
230
  fn=tts,
231
  inputs=[
232
  gr.Textbox(
 
234
  max_lines=1,
235
  label="Input text",
236
  ),
237
+ accent_dropdown,
238
+ speaker_dropdown,
 
 
 
 
239
  gr.Slider(
240
  0.1,
241
  2.0,
 
264
  with demo:
265
  gr.Markdown(title)
266
  gr.Markdown(description)
267
+ gr.TabbedInterface([matcha_inference, about_article], ["Demo", "About"])
268
+ accent_dropdown.select(fn=rs_change, inputs=accent_dropdown, outputs=speaker_dropdown)
269
  gr.Markdown(article)
270
 
271
  demo.queue(max_size=10)
spk_to_id_2.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "balear":{
3
+ "quim": 0,
4
+ "olga": 1
5
+ },
6
+ "central":{
7
+ "grau": 0,
8
+ "elia": 1
9
+ },
10
+ "occidental":{
11
+ "pere": 0,
12
+ "emma": 1
13
+ },
14
+ "valencia":{
15
+ "lluc": 0,
16
+ "gina": 1
17
+ }
18
+ }
text/cleaners.py CHANGED
@@ -17,8 +17,10 @@ from unidecode import unidecode
17
  from phonemizer import phonemize
18
  from phonemizer.backend import EspeakBackend
19
 
20
- backend = EspeakBackend("ca", preserve_punctuation=True, with_stress=True)
21
- backend_en = EspeakBackend("en-us", preserve_punctuation=True, with_stress=True)
 
 
22
 
23
  # Regular expression matching whitespace:
24
  _whitespace_re = re.compile(r"\s+")
@@ -113,21 +115,43 @@ def english_cleaners2(text):
113
  return phonemes
114
 
115
 
116
- def english_cleaners3(text):
117
- """Pipeline for English text, including abbreviation expansion. + punctuation + stress"""
118
- text = convert_to_ascii(text)
119
  text = lowercase(text)
120
- text = expand_abbreviations(text)
121
- phonemes = backend_en.phonemize([text], strip=True)[0]
 
 
 
 
 
 
 
 
 
122
  phonemes = collapse_whitespace(phonemes)
 
123
  return phonemes
124
 
 
 
 
 
 
 
 
 
 
125
 
126
- def catalan_cleaners(text):
127
- """Pipeline for catalan text, including punctuation + stress"""
128
- #text = convert_to_ascii(text)
129
  text = lowercase(text)
130
- #text = expand_abbreviations(text)
131
- phonemes = backend.phonemize([text], strip=True)[0]
132
  phonemes = collapse_whitespace(phonemes)
 
133
  return phonemes
 
 
 
17
  from phonemizer import phonemize
18
  from phonemizer.backend import EspeakBackend
19
 
20
+ backend_cat = EspeakBackend("ca", preserve_punctuation=True, with_stress=True)
21
+ backend_bal = EspeakBackend("ca-ba", preserve_punctuation=True, with_stress=True)
22
+ backend_val = EspeakBackend("ca-va", preserve_punctuation=True, with_stress=True)
23
+ backend_occ = EspeakBackend("ca-nw", preserve_punctuation=True, with_stress=True)
24
 
25
  # Regular expression matching whitespace:
26
  _whitespace_re = re.compile(r"\s+")
 
115
  return phonemes
116
 
117
 
118
+ def catalan_cleaners(text):
119
+ """Pipeline for Catalan text using the `ca` espeak backend: lowercase, phonemize (punctuation + stress kept), collapse whitespace."""
120
+ #text = convert_to_ascii(text)
121
  text = lowercase(text)
122
+ #text = expand_abbreviations(text)
123
+ phonemes = backend_cat.phonemize([text], strip=True)[0]
124
+ phonemes = collapse_whitespace(phonemes)
125
+ return phonemes
126
+
127
+ def catalan_balear_cleaners(text):
128
+ """Pipeline for Catalan text using the `ca-ba` espeak backend: lowercase, phonemize (punctuation + stress kept), collapse whitespace. Abbreviation expansion is disabled."""
129
+ # text = convert_to_ascii(text)
130
+ text = lowercase(text)
131
+ # text = expand_abbreviations(text)
132
+ phonemes = backend_bal.phonemize([text], strip=True, njobs=1)[0]
133
  phonemes = collapse_whitespace(phonemes)
134
+ # print(phonemes) # check punctuations!!
135
  return phonemes
136
 
137
+ def catalan_occidental_cleaners(text):
138
+ """Pipeline for Catalan text using the `ca-nw` espeak backend: lowercase, phonemize (punctuation + stress kept), collapse whitespace. Abbreviation expansion is disabled."""
139
+ # text = convert_to_ascii(text)
140
+ text = lowercase(text)
141
+ # text = expand_abbreviations(text)
142
+ phonemes = backend_occ.phonemize([text], strip=True, njobs=1)[0]
143
+ phonemes = collapse_whitespace(phonemes)
144
+ # print(phonemes) # check punctuations!!
145
+ return phonemes
146
 
147
+ def catalan_valencia_cleaners(text):
148
+ """Pipeline for Catalan text using the `ca-va` espeak backend: lowercase, phonemize (punctuation + stress kept), collapse whitespace. Abbreviation expansion is disabled."""
149
+ # text = convert_to_ascii(text)
150
  text = lowercase(text)
151
+ # text = expand_abbreviations(text)
152
+ phonemes = backend_val.phonemize([text], strip=True, njobs=1)[0]
153
  phonemes = collapse_whitespace(phonemes)
154
+ # print(phonemes) # check punctuations!!
155
  return phonemes
156
+
157
+