Upload 67 files
- app.py +281 -85
- descriptions.py +3 -4
- macros.py +39 -0
- presets/acou_1.npz +3 -0
- presets/acou_2.npz +3 -0
- presets/acou_3.npz +3 -0
- presets/acou_4.npz +3 -0
- presets/amused.npz +3 -0
- presets/anger.npz +3 -0
- presets/babara.npz +3 -0
- presets/bronya_1.npz +3 -0
- presets/dingzhen.npz +3 -0
- presets/disgust.npz +3 -0
- presets/emo_amused.npz +3 -0
- presets/emo_anger.npz +3 -0
- presets/emo_neutral.npz +3 -0
- presets/emo_sleepy.npz +3 -0
- presets/en2zh_tts_1.npz +3 -0
- presets/en2zh_tts_2.npz +3 -0
- presets/en2zh_tts_3.npz +3 -0
- presets/en2zh_tts_4.npz +3 -0
- presets/fuxuan_2.npz +3 -0
- presets/librispeech_1.npz +3 -0
- presets/librispeech_2.npz +3 -0
- presets/librispeech_3.npz +3 -0
- presets/librispeech_4.npz +3 -0
- presets/neutral.npz +3 -0
- presets/paimon_1.npz +3 -0
- presets/rosalia.npz +3 -0
- presets/seel.npz +3 -0
- presets/sleepiness.npz +3 -0
- presets/vctk_1.npz +3 -0
- presets/vctk_2.npz +3 -0
- presets/vctk_3.npz +3 -0
- presets/vctk_4.npz +3 -0
- presets/yaesakura.npz +3 -0
- presets/zh2en_tts_1.npz +3 -0
- presets/zh2en_tts_2.npz +3 -0
- presets/zh2en_tts_3.npz +3 -0
- presets/zh2en_tts_4.npz +3 -0
- requirements.txt +9 -3
- utils/__pycache__/__init__.cpython-38.pyc +0 -0
- utils/g2p/__pycache__/__init__.cpython-38.pyc +0 -0
- utils/g2p/__pycache__/cleaners.cpython-38.pyc +0 -0
- utils/g2p/__pycache__/english.cpython-38.pyc +0 -0
- utils/g2p/__pycache__/japanese.cpython-38.pyc +0 -0
- utils/g2p/__pycache__/mandarin.cpython-38.pyc +0 -0
- utils/g2p/__pycache__/symbols.cpython-38.pyc +0 -0
- utils/generation.py +256 -0
- utils/prompt_making.py +115 -0
app.py
CHANGED

@@ -4,10 +4,18 @@ import os
 import pathlib
 import time
 import tempfile
-
-
-
+import platform
+if platform.system().lower() == 'windows':
+    temp = pathlib.PosixPath
+    pathlib.PosixPath = pathlib.WindowsPath
+elif platform.system().lower() == 'linux':
+    temp = pathlib.WindowsPath
+    pathlib.WindowsPath = pathlib.PosixPath
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+
+import langid
+langid.set_languages(['en', 'zh', 'ja'])
+
 import torch
 import torchaudio
 import random
@@ -22,48 +30,21 @@ from data.collation import get_text_token_collater
 from models.vallex import VALLE
 from utils.g2p import PhonemeBpeTokenizer
 from descriptions import *
+from macros import *
 
 import gradio as gr
 import whisper
-
-torch.set_num_interop_threads(1)
-torch._C._jit_set_profiling_executor(False)
-torch._C._jit_set_profiling_mode(False)
-torch._C._set_graph_executor_optimize(False)
-# torch.manual_seed(42)
-
-lang2token = {
-    'zh': "[ZH]",
-    'ja': "[JA]",
-    "en": "[EN]",
-}
-
-lang2code = {
-    'zh': 0,
-    'ja': 1,
-    "en": 2,
-}
-
-token2lang = {
-    '[ZH]': "zh",
-    '[JA]': "ja",
-    "[EN]": "en",
-}
-
-code2lang = {
-    0: 'zh',
-    1: 'ja',
-    2: "en",
-}
+import multiprocessing
 
+thread_count = multiprocessing.cpu_count()
 
+print("Use",thread_count,"cpu cores for computing")
 
-
-
-
-
-
-}
+torch.set_num_threads(thread_count)
+torch.set_num_interop_threads(thread_count)
+torch._C._jit_set_profiling_executor(False)
+torch._C._jit_set_profiling_mode(False)
+torch._C._set_graph_executor_optimize(False)
 
 text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
 text_collater = get_text_token_collater()
@@ -74,30 +55,33 @@ if torch.cuda.is_available():
 
 # VALL-E-X model
 model = VALLE(
-
-
-
-
-
-
-
-
-
-
-)
+    N_DIM,
+    NUM_HEAD,
+    NUM_LAYERS,
+    norm_first=True,
+    add_prenet=False,
+    prefix_mode=PREFIX_MODE,
+    share_embedding=True,
+    nar_scale_factor=1.0,
+    prepend_bos=True,
+    num_quantizers=NUM_QUANTIZERS,
+)
 checkpoint = torch.load("./epoch-10.pt", map_location='cpu')
 missing_keys, unexpected_keys = model.load_state_dict(
     checkpoint["model"], strict=True
 )
 assert not missing_keys
-model.to('cpu')
 model.eval()
 
 # Encodec model
 audio_tokenizer = AudioTokenizer(device)
 
 # ASR
-whisper_model = whisper.load_model("medium")
+whisper_model = whisper.load_model("medium").cpu()
+
+# Voice Presets
+preset_list = os.walk("./presets/").__next__()[2]
+preset_list = [preset[:-4] for preset in preset_list if preset.endswith(".npz")]
 
 def clear_prompts():
     try:
@@ -136,24 +120,38 @@ def transcribe_one(model, audio_path):
         text_pr += "."
     return lang, text_pr
 
-def make_npz_prompt(name, uploaded_audio, recorded_audio):
+def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
     global model, text_collater, text_tokenizer, audio_tokenizer
     clear_prompts()
     audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
     sr, wav_pr = audio_prompt
-
+    if len(wav_pr) / sr > 15:
+        return "Rejected, Audio too long (should be less than 15 seconds)", None
+    if not isinstance(wav_pr, torch.FloatTensor):
+        wav_pr = torch.FloatTensor(wav_pr)
+    if wav_pr.abs().max() > 1:
+        wav_pr /= wav_pr.abs().max()
     if wav_pr.size(-1) == 2:
         wav_pr = wav_pr.mean(-1, keepdim=False)
-
-
+    if wav_pr.ndim == 1:
+        wav_pr = wav_pr.unsqueeze(0)
+    assert wav_pr.ndim and wav_pr.size(0) == 1
+
+    if transcript_content == "":
+        text_pr, lang_pr = make_prompt(name, wav_pr, sr, save=False)
+    else:
+        lang_pr = langid.classify(str(transcript_content))[0]
+        lang_token = lang2token[lang_pr]
+        text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
     # tokenize audio
-    encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr
+    encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
     audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
 
     # tokenize text
+    phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
     text_tokens, enroll_x_lens = text_collater(
         [
-
+            phonemes
         ]
     )
 
@@ -166,8 +164,8 @@ def make_npz_prompt(name, uploaded_audio, recorded_audio):
 
 
 def make_prompt(name, wav, sr, save=True):
-
     global whisper_model
+    whisper_model.to(device)
     if not isinstance(wav, torch.FloatTensor):
         wav = torch.tensor(wav)
     if wav.abs().max() > 1:
@@ -187,19 +185,41 @@ def make_prompt(name, wav, sr, save=True):
         os.remove(f"./prompts/{name}.wav")
         os.remove(f"./prompts/{name}.txt")
 
+    whisper_model.cpu()
     torch.cuda.empty_cache()
     return text, lang
 
 @torch.no_grad()
-def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
+def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt, transcript_content):
+    if len(text) > 150:
+        return "Rejected, Text too long (should be less than 150 characters)", None
     global model, text_collater, text_tokenizer, audio_tokenizer
+    model.to(device)
     audio_prompt = audio_prompt if audio_prompt is not None else record_audio_prompt
     sr, wav_pr = audio_prompt
-
+    if len(wav_pr) / sr > 15:
+        return "Rejected, Audio too long (should be less than 15 seconds)", None
+    if not isinstance(wav_pr, torch.FloatTensor):
+        wav_pr = torch.FloatTensor(wav_pr)
+    if wav_pr.abs().max() > 1:
+        wav_pr /= wav_pr.abs().max()
     if wav_pr.size(-1) == 2:
         wav_pr = wav_pr.mean(-1, keepdim=False)
-
-
+    if wav_pr.ndim == 1:
+        wav_pr = wav_pr.unsqueeze(0)
+    assert wav_pr.ndim and wav_pr.size(0) == 1
+
+    if transcript_content == "":
+        text_pr, lang_pr = make_prompt('dummy', wav_pr, sr, save=False)
+    else:
+        lang_pr = langid.classify(str(transcript_content))[0]
+        lang_token = lang2token[lang_pr]
+        text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
+
+    if language == 'auto-detect':
+        lang_token = lang2token[langid.classify(text)[0]]
+    else:
+        lang_token = langdropdown2token[language]
     lang = token2lang[lang_token]
     text = lang_token + text + lang_token
 
@@ -207,24 +227,28 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
     model.to(device)
 
     # tokenize audio
-    encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr
+    encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
     audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)
 
     # tokenize text
     logging.info(f"synthesize text: {text}")
+    phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
     text_tokens, text_tokens_lens = text_collater(
         [
-
+            phone_tokens
         ]
     )
 
     enroll_x_lens = None
     if text_pr:
-
+        text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
+        text_prompts, enroll_x_lens = text_collater(
             [
-
+                text_prompts
             ]
         )
+        text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
+        text_tokens_lens += enroll_x_lens
     lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
     encoded_frames = model.inference(
         text_tokens.to(device),
@@ -234,7 +258,7 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
         top_k=-100,
         temperature=1,
         prompt_language=lang_pr,
-        text_language=lang,
+        text_language=langs if accent == "no-accent" else lang,
     )
     samples = audio_tokenizer.decode(
         [(encoded_frames.transpose(2, 1), None)]
@@ -248,17 +272,24 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
     return message, (24000, samples[0][0].cpu().numpy())
 
 @torch.no_grad()
-def infer_from_prompt(text, language, accent, prompt_file):
-
-
+def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
+    if len(text) > 150:
+        return "Rejected, Text too long (should be less than 150 characters)", None
     clear_prompts()
+    model.to(device)
     # text to synthesize
-
+    if language == 'auto-detect':
+        lang_token = lang2token[langid.classify(text)[0]]
+    else:
+        lang_token = langdropdown2token[language]
     lang = token2lang[lang_token]
     text = lang_token + text + lang_token
 
     # load prompt
-
+    if prompt_file is not None:
+        prompt_data = np.load(prompt_file.name)
+    else:
+        prompt_data = np.load(os.path.join("./presets/", f"{preset_prompt}.npz"))
     audio_prompts = prompt_data['audio_tokens']
     text_prompts = prompt_data['text_tokens']
     lang_pr = prompt_data['lang_code']
@@ -270,9 +301,10 @@ def infer_from_prompt(text, language, accent, prompt_file):
 
     enroll_x_lens = text_prompts.shape[-1]
     logging.info(f"synthesize text: {text}")
+    phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
     text_tokens, text_tokens_lens = text_collater(
         [
-
+            phone_tokens
         ]
     )
     text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
@@ -287,13 +319,11 @@ def infer_from_prompt(text, language, accent, prompt_file):
         top_k=-100,
         temperature=1,
         prompt_language=lang_pr,
-        text_language=lang,
+        text_language=langs if accent == "no-accent" else lang,
     )
     samples = audio_tokenizer.decode(
         [(encoded_frames.transpose(2, 1), None)]
     )
-
-    # offload model
     model.to('cpu')
     torch.cuda.empty_cache()
 
@@ -301,6 +331,144 @@ def infer_from_prompt(text, language, accent, prompt_file):
     return message, (24000, samples[0][0].cpu().numpy())
 
 
+from utils.sentence_cutter import split_text_into_sentences
+@torch.no_grad()
+def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
+    """
+    For long audio generation, two modes are available.
+    fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
+    sliding-window: This mode will use the last sentence as the prompt for the next sentence, but has some concern on speaker maintenance.
+    """
+    if len(text) > 1000:
+        return "Rejected, Text too long (should be less than 1000 characters)", None
+    mode = 'fixed-prompt'
+    global model, audio_tokenizer, text_tokenizer, text_collater
+    model.to(device)
+    if (prompt is None or prompt == "") and preset_prompt == "":
+        mode = 'sliding-window'  # If no prompt is given, use sliding-window mode
+    sentences = split_text_into_sentences(text)
+    # detect language
+    if language == "auto-detect":
+        language = langid.classify(text)[0]
+    else:
+        language = token2lang[langdropdown2token[language]]
+
+    # if initial prompt is given, encode it
+    if prompt is not None and prompt != "":
+        # load prompt
+        prompt_data = np.load(prompt.name)
+        audio_prompts = prompt_data['audio_tokens']
+        text_prompts = prompt_data['text_tokens']
+        lang_pr = prompt_data['lang_code']
+        lang_pr = code2lang[int(lang_pr)]
+
+        # numpy to tensor
+        audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
+        text_prompts = torch.tensor(text_prompts).type(torch.int32)
+    elif preset_prompt is not None and preset_prompt != "":
+        prompt_data = np.load(os.path.join("./presets/", f"{preset_prompt}.npz"))
+        audio_prompts = prompt_data['audio_tokens']
+        text_prompts = prompt_data['text_tokens']
+        lang_pr = prompt_data['lang_code']
+        lang_pr = code2lang[int(lang_pr)]
+
+        # numpy to tensor
+        audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
+        text_prompts = torch.tensor(text_prompts).type(torch.int32)
+    else:
+        audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
+        text_prompts = torch.zeros([1, 0]).type(torch.int32)
+        lang_pr = language if language != 'mix' else 'en'
+    if mode == 'fixed-prompt':
+        complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
+        for text in sentences:
+            text = text.replace("\n", "").strip(" ")
+            if text == "":
+                continue
+            lang_token = lang2token[language]
+            lang = token2lang[lang_token]
+            text = lang_token + text + lang_token
+
+            enroll_x_lens = text_prompts.shape[-1]
+            logging.info(f"synthesize text: {text}")
+            phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
+            text_tokens, text_tokens_lens = text_collater(
+                [
+                    phone_tokens
+                ]
+            )
+            text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
+            text_tokens_lens += enroll_x_lens
+            # accent control
+            lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
+            encoded_frames = model.inference(
+                text_tokens.to(device),
+                text_tokens_lens.to(device),
+                audio_prompts,
+                enroll_x_lens=enroll_x_lens,
+                top_k=-100,
+                temperature=1,
+                prompt_language=lang_pr,
+                text_language=langs if accent == "no-accent" else lang,
+            )
+            complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
+        samples = audio_tokenizer.decode(
+            [(complete_tokens, None)]
+        )
+        model.to('cpu')
+        message = f"Cut into {len(sentences)} sentences"
+        return message, (24000, samples[0][0].cpu().numpy())
+    elif mode == "sliding-window":
+        complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
+        original_audio_prompts = audio_prompts
+        original_text_prompts = text_prompts
+        for text in sentences:
+            text = text.replace("\n", "").strip(" ")
+            if text == "":
+                continue
+            lang_token = lang2token[language]
+            lang = token2lang[lang_token]
+            text = lang_token + text + lang_token
+
+            enroll_x_lens = text_prompts.shape[-1]
+            logging.info(f"synthesize text: {text}")
+            phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
+            text_tokens, text_tokens_lens = text_collater(
+                [
+                    phone_tokens
+                ]
+            )
+            text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
+            text_tokens_lens += enroll_x_lens
+            # accent control
+            lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
+            encoded_frames = model.inference(
+                text_tokens.to(device),
+                text_tokens_lens.to(device),
+                audio_prompts,
+                enroll_x_lens=enroll_x_lens,
+                top_k=-100,
+                temperature=1,
+                prompt_language=lang_pr,
+                text_language=langs if accent == "no-accent" else lang,
+            )
+            complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
+            if torch.rand(1) < 1.0:
+                audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
+                text_prompts = text_tokens[:, enroll_x_lens:]
+            else:
+                audio_prompts = original_audio_prompts
+                text_prompts = original_text_prompts
+        samples = audio_tokenizer.decode(
+            [(complete_tokens, None)]
+        )
+        model.to('cpu')
+        message = f"Cut into {len(sentences)} sentences"
+        return message, (24000, samples[0][0].cpu().numpy())
+    else:
+        raise ValueError(f"No such mode {mode}")
+
+
 def main():
     app = gr.Blocks()
     with app:
@@ -312,9 +480,12 @@ def main():
 
                     textbox = gr.TextArea(label="Text",
                                           placeholder="Type your sentence here",
-                                          value="
-                    language_dropdown = gr.Dropdown(choices=['English', '中文', '日本語'], value='English', label='
+                                          value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
+                    language_dropdown = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='English', label='auto-detect')
                     accent_dropdown = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', label='accent')
+                    textbox_transcript = gr.TextArea(label="Transcript",
+                                                     placeholder="Write transcript here. (leave empty to use whisper)",
+                                                     value="", elem_id=f"prompt-name")
                     upload_audio_prompt = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
                     record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
                 with gr.Column():
@@ -322,7 +493,7 @@ def main():
                     audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                     btn = gr.Button("Generate!")
                     btn.click(infer_from_audio,
-                              inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt],
+                              inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
                               outputs=[text_output, audio_output])
                     textbox_mp = gr.TextArea(label="Prompt name",
                                              placeholder="Name your prompt here",
@@ -330,7 +501,7 @@ def main():
                     btn_mp = gr.Button("Make prompt!")
                     prompt_output = gr.File(interactive=False)
                     btn_mp.click(make_npz_prompt,
-                                 inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt],
+                                 inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
                                  outputs=[text_output, prompt_output])
         with gr.Tab("Make prompt"):
             gr.Markdown(make_prompt_md)
@@ -339,6 +510,10 @@ def main():
                     textbox2 = gr.TextArea(label="Prompt name",
                                            placeholder="Name your prompt here",
                                            value="prompt_1", elem_id=f"prompt-name")
+                    # 添加选择语言和输入台本的地方
+                    textbox_transcript2 = gr.TextArea(label="Transcript",
+                                                      placeholder="Write transcript here. (leave empty to use whisper)",
+                                                      value="", elem_id=f"prompt-name")
                     upload_audio_prompt_2 = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
                     record_audio_prompt_2 = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
                 with gr.Column():
@@ -346,7 +521,7 @@ def main():
                     prompt_output_2 = gr.File(interactive=False)
                     btn_2 = gr.Button("Make!")
                     btn_2.click(make_npz_prompt,
-                                inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2],
+                                inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
                                 outputs=[text_output_2, prompt_output_2])
         with gr.Tab("Infer from prompt"):
             gr.Markdown(infer_from_prompt_md)
@@ -354,19 +529,40 @@ def main():
                 with gr.Column():
                     textbox_3 = gr.TextArea(label="Text",
                                             placeholder="Type your sentence here",
-                                            value="
-                    language_dropdown_3 = gr.Dropdown(choices=['English', '中文', '日本語'], value='
+                                            value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
+                    language_dropdown_3 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語', 'Mix'], value='auto-detect',
                                                       label='language')
                     accent_dropdown_3 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
                                                     label='accent')
+                    preset_dropdown_3 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
                     prompt_file = gr.File(file_count='single', file_types=['.npz'], interactive=True)
                 with gr.Column():
                     text_output_3 = gr.Textbox(label="Message")
                     audio_output_3 = gr.Audio(label="Output Audio", elem_id="tts-audio")
                     btn_3 = gr.Button("Generate!")
                     btn_3.click(infer_from_prompt,
-                                inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, prompt_file],
+                                inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
                                 outputs=[text_output_3, audio_output_3])
+        with gr.Tab("Infer long text"):
+            gr.Markdown("This is a long text generation demo. You can use this to generate long audio. ")
+            with gr.Row():
+                with gr.Column():
+                    textbox_4 = gr.TextArea(label="Text",
+                                            placeholder="Type your sentence here",
+                                            value=long_text_example, elem_id=f"tts-input")
+                    language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
+                                                      label='language')
+                    accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
+                                                    label='accent')
+                    preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
+                    prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True)
+                with gr.Column():
+                    text_output_4 = gr.TextArea(label="Message")
+                    audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio")
+                    btn_4 = gr.Button("Generate!")
+                    btn_4.click(infer_long_text,
+                                inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
+                                outputs=[text_output_4, audio_output_4])
 
     app.launch()
 
descriptions.py
CHANGED

@@ -1,8 +1,5 @@
 top_md = """
 # VALL-E X
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1yyD_sz531QntLKowMHo-XxorsFBCfKul?usp=sharing)
-[![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/Plachtaa/vallex-webui)
-Unofficial implementation of Microsoft's [VALL-E X](https://arxiv.org/pdf/2303.03926).<br>
 VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
 an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
 This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)<br>
@@ -24,4 +21,6 @@ Get a `.npz` file as the encoded audio prompt. Use it by **"Infer with prompt"**
 infer_from_prompt_md = """
 Faster than **"Infer from audio"**.<br>
 You need to **"Make prompt"** first, and upload the encoded prompt (a `.npz` file)
-"""
+"""
+
+long_text_example = "Just a few years ago, there were no legions of deep learning scientists developing intelligent products and services at major companies and startups. When we entered the field, machine learning did not command headlines in daily newspapers. Our parents had no idea what machine learning was, let alone why we might prefer it to a career in medicine or law. Machine learning was a blue skies academic discipline whose industrial significance was limited to a narrow set of real-world applications, including speech recognition and computer vision. Moreover, many of these applications required so much domain knowledge that they were often regarded as entirely separate areas for which machine learning was one small component. At that time, neural networks—the predecessors of the deep learning methods that we focus on in this book—were generally regarded as outmoded."
macros.py
ADDED
@@ -0,0 +1,39 @@

NUM_LAYERS = 12
NUM_HEAD = 16
N_DIM = 1024
PREFIX_MODE = 1
NUM_QUANTIZERS = 8
SAMPLE_RATE = 24000

lang2token = {
    'zh': "[ZH]",
    'ja': "[JA]",
    "en": "[EN]",
    'mix': "",
}

lang2code = {
    'zh': 0,
    'ja': 1,
    "en": 2,
}

token2lang = {
    '[ZH]': "zh",
    '[JA]': "ja",
    "[EN]": "en",
    "": "mix"
}

code2lang = {
    0: 'zh',
    1: 'ja',
    2: "en",
}

langdropdown2token = {
    'English': "[EN]",
    '中文': "[ZH]",
    '日本語': "[JA]",
    'Mix': "",
}
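
The four dictionaries above are meant to round-trip between the Gradio dropdown labels, the inline language tags wrapped around the text, and the integer code stored in prompt files. A minimal sketch of that chain (the sample values are purely illustrative):

```python
from macros import lang2token, token2lang, lang2code, code2lang, langdropdown2token

choice = '中文'                       # label coming from the Gradio dropdown
token = langdropdown2token[choice]    # '[ZH]', the tag wrapped around the text
lang = token2lang[token]              # 'zh', the language id used for tokenization
code = lang2code[lang]                # 0, the integer stored in .npz prompt files
assert code2lang[code] == lang

tagged = token + "你好，世界" + token    # how app.py brackets the synthesis text
print(tagged)                         # [ZH]你好，世界[ZH]
```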
presets/*.npz (37 files)
ADDED

Every preset is a new Git LFS pointer file (first line `version https://git-lfs.github.com/spec/v1`, followed by the object id and size):

- presets/acou_1.npz: oid sha256:470ce66fc24a2d14e162343381f7d93ef0a3af51edf5fd37240c21f492b4e769, size 15650
- presets/acou_2.npz: oid sha256:ec1c5328751cadeed5356d4264759799ad96d33ea8dd4f8a3d0a80dd8ddb0e74, size 15426
- presets/acou_3.npz: oid sha256:03f241b094a32b3f542e74374183c6d15e8b70ae73ceeafb11bfd4ee6b8b4a3a, size 15410
- presets/acou_4.npz: oid sha256:52b96f32863f13f84cf7ac4a27d2bc95cea70c350a037f4d1890b20b8da9501e, size 15506
- presets/amused.npz: oid sha256:df3e882f3a62805b9aaf300d81822cd4eddeafee480503b7b78e32be2085fb11, size 20882
- presets/anger.npz: oid sha256:959cec6dc0b30219db0d70cdd165fe00bbdc098165cf9d67ccdd1ecf7a5da5be, size 22090
- presets/babara.npz: oid sha256:8106b2a98c3f70587f23ab46ed5bf73b1c9a770481c3620ab140bd3256010376, size 11526
- presets/bronya_1.npz: oid sha256:02eaada2c3d58866c813887ed9f871587ef5a7e976abc23382ce46a17b208001, size 18106
- presets/dingzhen.npz: oid sha256:4d19167c65eefef5e42dfaa1919ff5149ca0a93cb052396a47d1f42f9865f5f8, size 18154
- presets/disgust.npz: oid sha256:4443f0a395072700f2ec6101dbf2ad9d28968aa3e5809e384ea131832f894d7f, size 39386
- presets/emo_amused.npz: oid sha256:38be2ea16dc79beae68b6c885d99d4dad516acbd88ed5ed6991dd97301f2f30b, size 15378
- presets/emo_anger.npz: oid sha256:3261c3bdd5b7b4be9783d9293ee3d871be9d9d791f2b3a8bf62a1a0ee0ed93e6, size 15434
- presets/emo_neutral.npz: oid sha256:2188c4154692316ed7c0edee3aa3dd8678be36f355ee2b8c8a3a6412c3673ba9, size 15578
- presets/emo_sleepy.npz: oid sha256:2a53255890beaf4ed339e1967f0837fdb87c34c9f7e18bf77cd4b08eba176963, size 15370
- presets/en2zh_tts_1.npz: oid sha256:5d4de4ed055448ea54f7b40091afae565197f960d954279035ac537ea5a01bc4, size 44354
- presets/en2zh_tts_2.npz: oid sha256:dcc066ea104daa27d1552fe76574d09359d56fa892241581cc19e931a696eca9, size 24178
- presets/en2zh_tts_3.npz: oid sha256:7468944e6d0ed7f2da033e8037be07dbafc76bd1ed7c0f5996d85ff45aacda11, size 21410
- presets/en2zh_tts_4.npz: oid sha256:0fd8d0914e74769114310e9504d68d6b7b0c6aacd46763478cbfd4f9631ad54a, size 43826
- presets/fuxuan_2.npz: oid sha256:17b90388d179ae309e1f577c28c3f10d9bed73c6ccbffdd829c00568eb3941e6, size 50330
- presets/librispeech_1.npz: oid sha256:415b244e43b45291fd651d71f15bb7a31c244e2054988c436f6bbc04465c6099, size 15650
- presets/librispeech_2.npz: oid sha256:bd74e77370248b025321b9dbae25b1572f13f98da63255e384d382d2b0c78227, size 15418
- presets/librispeech_3.npz: oid sha256:1eceb3f4cc0f3a8856b5e3b5f1ca28c428d75305b1452da1ecf4013bc358ccaa, size 15634
- presets/librispeech_4.npz: oid sha256:3939dde39f5e65bc01f5eba9acb7b8329465aaca3c38edf1b240aa714e687960, size 15594
- presets/neutral.npz: oid sha256:a8a63993526ffdc788a711b512d07a8b1c816151a1edb63913d0bfb48c2ea380, size 21050
- presets/paimon_1.npz: oid sha256:452d5e0cd3a060db521bd65a16af818a6177f357801402aa5581eceb2c24039a, size 13762
- presets/rosalia.npz: oid sha256:af87ebe283bbb7b527c6c0ff0a02a315416485677fe23330040c2766fa9af919, size 11414
- presets/seel.npz: oid sha256:44ad2e900df3625f9753e949dc5a7d8479c4091e24cb18cbf46e34e29498d952, size 13554
- presets/sleepiness.npz: oid sha256:e0f866a278a10c7b6b494fb62589a9d8fef778ccf272df3b0d5510f45b243b5c, size 33218
- presets/vctk_1.npz: oid sha256:1c9df2ea8c2bc919c0ac50f8e05950bb4e831de69b33a7fb12d584da5b2512f2, size 15530
- presets/vctk_2.npz: oid sha256:cc84744435a304b3e700b8b1ab94c3b891db3056bd55a0f9dd99eff284016efa, size 15458
- presets/vctk_3.npz: oid sha256:ec0d528c6ae9c8f32b02ca6b57aa565b9fe63f401fd04f2632ed7e536699b9ac, size 15450
- presets/vctk_4.npz: oid sha256:8ff2b71254ae00be6e42ad206c7616d168bd41582837e9eeb4d6cd669bd0b140, size 15330
- presets/yaesakura.npz: oid sha256:b388a18d286b4ba13d45bae373a716c0010dc40ae9c940d53b5a04cbc64e95ff, size 12442
- presets/zh2en_tts_1.npz: oid sha256:07bff150ad145f9b06f0e7cbf9b0ee4d9e926600efa0d129bd831c8b2993c2b0, size 23546
- presets/zh2en_tts_2.npz: oid sha256:0257d0782578c7813c3f43b5e93c0e681f9ea42fe76775d5a4f4fea64609b03e, size 20170
- presets/zh2en_tts_3.npz: oid sha256:5da48e060d15f391767bffe1d528bfbc782a562413feed2e9bd2cafa82bf644a, size 17906
- presets/zh2en_tts_4.npz: oid sha256:bda7a70ed9b03d8f1ff99d2444ea1df476a8deaf75633aa3b3f6cf3f45ae7e5e, size 33682
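
The handlers in app.py (`infer_from_prompt`, `infer_long_text`) and utils/generation.py all open these presets with numpy and read the same three keys: `audio_tokens`, `text_tokens`, and `lang_code`. A small inspection sketch, assuming it is run from the repository root (the chosen preset and the printed fields are just an example):

```python
import numpy as np

# Any of the presets/*.npz files has the same layout; acou_1 is an arbitrary pick.
prompt_data = np.load("./presets/acou_1.npz")

audio_tokens = prompt_data["audio_tokens"]   # EnCodec codes used as the acoustic prompt
text_tokens = prompt_data["text_tokens"]     # phoneme token ids of the prompt transcript
lang_code = int(prompt_data["lang_code"])    # 0 = zh, 1 = ja, 2 = en (see code2lang in macros.py)

print(audio_tokens.shape, text_tokens.shape, lang_code)
```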
requirements.txt
CHANGED

@@ -1,8 +1,10 @@
+soundfile
 numpy
-torch
-
+torch==2.0.1
+torchvision==0.15.2
 tokenizers
 encodec
+langid
 unidecode
 pyopenjtalk
 pypinyin
@@ -11,4 +13,8 @@ cn2an
 jieba
 eng_to_ipa
 jieba
-
+SudachiPy
+openai-whisper
+phonemizer
+matplotlib
+gradio
Compiled CPython 3.8 bytecode caches (binary files) were also added:

- utils/__pycache__/__init__.cpython-38.pyc: binary file (564 Bytes)
- utils/g2p/__pycache__/__init__.cpython-38.pyc: binary file (3.02 kB)
- utils/g2p/__pycache__/cleaners.cpython-38.pyc: binary file (1.95 kB)
- utils/g2p/__pycache__/english.cpython-38.pyc: binary file (4.85 kB)
- utils/g2p/__pycache__/japanese.cpython-38.pyc: binary file (4.44 kB)
- utils/g2p/__pycache__/mandarin.cpython-38.pyc: binary file (6.37 kB)
- utils/g2p/__pycache__/symbols.cpython-38.pyc: binary file (434 Bytes)
utils/generation.py
ADDED
@@ -0,0 +1,256 @@

import os
import torch
import gdown
import logging
import langid
langid.set_languages(['en', 'zh', 'ja'])

import pathlib
import platform
if platform.system().lower() == 'windows':
    temp = pathlib.PosixPath
    pathlib.PosixPath = pathlib.WindowsPath
elif platform.system().lower() == 'linux':
    temp = pathlib.WindowsPath
    pathlib.WindowsPath = pathlib.PosixPath

import numpy as np
from data.tokenizer import (
    AudioTokenizer,
    tokenize_audio,
)
from data.collation import get_text_token_collater
from models.vallex import VALLE
from utils.g2p import PhonemeBpeTokenizer
from utils.sentence_cutter import split_text_into_sentences

from macros import *

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda", 0)

url = 'https://drive.google.com/file/d/10gdQWvP-K_e1undkvv0p2b7SU6I4Egyl/view?usp=sharing'

checkpoints_dir = "./checkpoints/"

model_checkpoint_name = "vallex-checkpoint.pt"

model = None

codec = None

text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
text_collater = get_text_token_collater()

def preload_models():
    global model, codec
    if not os.path.exists(checkpoints_dir): os.mkdir(checkpoints_dir)
    if not os.path.exists(os.path.join(checkpoints_dir, model_checkpoint_name)):
        gdown.download(id="10gdQWvP-K_e1undkvv0p2b7SU6I4Egyl", output=os.path.join(checkpoints_dir, model_checkpoint_name), quiet=False)
    # VALL-E
    model = VALLE(
        N_DIM,
        NUM_HEAD,
        NUM_LAYERS,
        norm_first=True,
        add_prenet=False,
        prefix_mode=PREFIX_MODE,
        share_embedding=True,
        nar_scale_factor=1.0,
        prepend_bos=True,
        num_quantizers=NUM_QUANTIZERS,
    ).to(device)
    checkpoint = torch.load(os.path.join(checkpoints_dir, model_checkpoint_name), map_location='cpu')
    missing_keys, unexpected_keys = model.load_state_dict(
        checkpoint["model"], strict=True
    )
    assert not missing_keys
    model.eval()

    # Encodec
    codec = AudioTokenizer(device)

@torch.no_grad()
def generate_audio(text, prompt=None, language='auto', accent='no-accent'):
    global model, codec, text_tokenizer, text_collater
    text = text.replace("\n", "").strip(" ")
    # detect language
    if language == "auto":
        language = langid.classify(text)[0]
    lang_token = lang2token[language]
    lang = token2lang[lang_token]
    text = lang_token + text + lang_token

    # load prompt
    if prompt is not None:
        prompt_path = prompt
        if not os.path.exists(prompt_path):
            prompt_path = "./presets/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            prompt_path = "./customs/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            raise ValueError(f"Cannot find prompt {prompt}")
        prompt_data = np.load(prompt_path)
        audio_prompts = prompt_data['audio_tokens']
        text_prompts = prompt_data['text_tokens']
        lang_pr = prompt_data['lang_code']
        lang_pr = code2lang[int(lang_pr)]

        # numpy to tensor
        audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
        text_prompts = torch.tensor(text_prompts).type(torch.int32)
    else:
        audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
        text_prompts = torch.zeros([1, 0]).type(torch.int32)
        lang_pr = lang if lang != 'mix' else 'en'

    enroll_x_lens = text_prompts.shape[-1]
    logging.info(f"synthesize text: {text}")
    phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
    text_tokens, text_tokens_lens = text_collater(
        [
            phone_tokens
        ]
    )
    text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
    text_tokens_lens += enroll_x_lens
    # accent control
    lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
    encoded_frames = model.inference(
        text_tokens.to(device),
        text_tokens_lens.to(device),
        audio_prompts,
        enroll_x_lens=enroll_x_lens,
        top_k=-100,
        temperature=1,
        prompt_language=lang_pr,
        text_language=langs if accent == "no-accent" else lang,
    )
    samples = codec.decode(
        [(encoded_frames.transpose(2, 1), None)]
    )

    return samples[0][0].cpu().numpy()

@torch.no_grad()
def generate_audio_from_long_text(text, prompt=None, language='auto', accent='no-accent', mode='sliding-window'):
    """
    For long audio generation, two modes are available.
    fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
    sliding-window: This mode will use the last sentence as the prompt for the next sentence, but has some concern on speaker maintenance.
    """
    global model, codec, text_tokenizer, text_collater
    if prompt is None or prompt == "":
        mode = 'sliding-window'  # If no prompt is given, use sliding-window mode
    sentences = split_text_into_sentences(text)
    # detect language
    if language == "auto":
        language = langid.classify(text)[0]

    # if initial prompt is given, encode it
    if prompt is not None and prompt != "":
        prompt_path = prompt
        if not os.path.exists(prompt_path):
            prompt_path = "./presets/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            prompt_path = "./customs/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            raise ValueError(f"Cannot find prompt {prompt}")
        prompt_data = np.load(prompt_path)
        audio_prompts = prompt_data['audio_tokens']
        text_prompts = prompt_data['text_tokens']
        lang_pr = prompt_data['lang_code']
        lang_pr = code2lang[int(lang_pr)]

        # numpy to tensor
        audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
        text_prompts = torch.tensor(text_prompts).type(torch.int32)
    else:
        audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
        text_prompts = torch.zeros([1, 0]).type(torch.int32)
        lang_pr = language if language != 'mix' else 'en'
    if mode == 'fixed-prompt':
        complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
        for text in sentences:
            text = text.replace("\n", "").strip(" ")
            if text == "":
                continue
            lang_token = lang2token[language]
            lang = token2lang[lang_token]
            text = lang_token + text + lang_token

            enroll_x_lens = text_prompts.shape[-1]
            logging.info(f"synthesize text: {text}")
            phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
            text_tokens, text_tokens_lens = text_collater(
                [
                    phone_tokens
                ]
            )
            text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
            text_tokens_lens += enroll_x_lens
            # accent control
            lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
            encoded_frames = model.inference(
                text_tokens.to(device),
                text_tokens_lens.to(device),
                audio_prompts,
                enroll_x_lens=enroll_x_lens,
                top_k=-100,
                temperature=1,
                prompt_language=lang_pr,
                text_language=langs if accent == "no-accent" else lang,
            )
            complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
        samples = codec.decode(
            [(complete_tokens, None)]
        )
        return samples[0][0].cpu().numpy()
    elif mode == "sliding-window":
        complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
        original_audio_prompts = audio_prompts
        original_text_prompts = text_prompts
        for text in sentences:
            text = text.replace("\n", "").strip(" ")
            if text == "":
                continue
            lang_token = lang2token[language]
            lang = token2lang[lang_token]
            text = lang_token + text + lang_token

            enroll_x_lens = text_prompts.shape[-1]
            logging.info(f"synthesize text: {text}")
            phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
            text_tokens, text_tokens_lens = text_collater(
                [
                    phone_tokens
                ]
            )
            text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
            text_tokens_lens += enroll_x_lens
            # accent control
            lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
            encoded_frames = model.inference(
                text_tokens.to(device),
                text_tokens_lens.to(device),
                audio_prompts,
                enroll_x_lens=enroll_x_lens,
                top_k=-100,
                temperature=1,
                prompt_language=lang_pr,
                text_language=langs if accent == "no-accent" else lang,
            )
            complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
            if torch.rand(1) < 0.5:
                audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
                text_prompts = text_tokens[:, enroll_x_lens:]
            else:
                audio_prompts = original_audio_prompts
                text_prompts = original_text_prompts
        samples = codec.decode(
            [(complete_tokens, None)]
        )
        return samples[0][0].cpu().numpy()
    else:
        raise ValueError(f"No such mode {mode}")
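
A minimal usage sketch for this new module, assuming it is run from the repository root; the example text and output paths are illustrative. SAMPLE_RATE comes in via `from macros import *` and matches the 24 kHz output used in app.py, and soundfile is already listed in requirements.txt:

```python
import soundfile as sf

from utils.generation import SAMPLE_RATE, preload_models, generate_audio

# Downloads ./checkpoints/vallex-checkpoint.pt on first use, then builds VALL-E and EnCodec.
preload_models()

# Zero-shot synthesis without a prompt; the language is auto-detected with langid.
audio_array = generate_audio("Welcome back, Master. What can I do for you today?")
sf.write("vallex_generation.wav", audio_array, SAMPLE_RATE)

# With a voice preset: "dingzhen" resolves to ./presets/dingzhen.npz (user prompts live in ./customs/).
audio_array = generate_audio("Welcome back, Master.", prompt="dingzhen")
sf.write("vallex_preset.wav", audio_array, SAMPLE_RATE)
```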
utils/prompt_making.py
ADDED
@@ -0,0 +1,115 @@

import os
import torch
import torchaudio
import logging
import langid
import whisper
langid.set_languages(['en', 'zh', 'ja'])

import numpy as np
from data.tokenizer import (
    AudioTokenizer,
    tokenize_audio,
)
from data.collation import get_text_token_collater
from utils.g2p import PhonemeBpeTokenizer

from macros import *

text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
text_collater = get_text_token_collater()

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda", 0)

codec = AudioTokenizer(device)

whisper_model = None

@torch.no_grad()
def transcribe_one(model, audio_path):
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    lang = max(probs, key=probs.get)
    # decode the audio
    options = whisper.DecodingOptions(temperature=1.0, best_of=5, fp16=False if device == torch.device("cpu") else True, sample_len=150)
    result = whisper.decode(model, mel, options)

    # print the recognized text
    print(result.text)

    text_pr = result.text
    if text_pr.strip(" ")[-1] not in "?!.,。,?!。、":
        text_pr += "."
    return lang, text_pr

def make_prompt(name, audio_prompt_path, transcript=None):
    global model, text_collater, text_tokenizer, codec
    wav_pr, sr = torchaudio.load(audio_prompt_path)
    # check length
    if wav_pr.size(-1) / sr > 15:
        raise ValueError(f"Prompt too long, expect length below 15 seconds, got {wav_pr / sr} seconds.")
    if wav_pr.size(0) == 2:
        wav_pr = wav_pr.mean(0, keepdim=True)
    text_pr, lang_pr = make_transcript(name, wav_pr, sr, transcript)

    # tokenize audio
    encoded_frames = tokenize_audio(codec, (wav_pr, sr))
    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()

    # tokenize text
    phonemes, langs = text_tokenizer.tokenize(text=f"{text_pr}".strip())
    text_tokens, enroll_x_lens = text_collater(
        [
            phonemes
        ]
    )

    message = f"Detected language: {lang_pr}\n Detected text {text_pr}\n"

    # save as npz file
    save_path = os.path.join("./customs/", f"{name}.npz")
    np.savez(save_path, audio_tokens=audio_tokens, text_tokens=text_tokens, lang_code=lang2code[lang_pr])
    logging.info(f"Successful. Prompt saved to {save_path}")


def make_transcript(name, wav, sr, transcript=None):

    if not isinstance(wav, torch.FloatTensor):
        wav = torch.tensor(wav)
    if wav.abs().max() > 1:
        wav /= wav.abs().max()
    if wav.size(-1) == 2:
        wav = wav.mean(-1, keepdim=False)
    if wav.ndim == 1:
        wav = wav.unsqueeze(0)
    assert wav.ndim and wav.size(0) == 1
    if transcript is None or transcript == "":
        logging.info("Transcript not given, using Whisper...")
        global whisper_model
        if whisper_model is None:
            whisper_model = whisper.load_model("medium")
        whisper_model.to(device)
        torchaudio.save(f"./prompts/{name}.wav", wav, sr)
        lang, text = transcribe_one(whisper_model, f"./prompts/{name}.wav")
        lang_token = lang2token[lang]
        text = lang_token + text + lang_token
        os.remove(f"./prompts/{name}.wav")
        whisper_model.cpu()
    else:
        text = transcript
        lang, _ = langid.classify(text)
        lang_token = lang2token[lang]
        text = lang_token + text + lang_token

    torch.cuda.empty_cache()
    return text, lang
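
A sketch of how this module pairs with utils/generation.py: make_prompt encodes a short reference recording into ./customs/<name>.npz (audio_tokens, text_tokens, lang_code), and generate_audio can then load that prompt by name. The wav path and prompt name below are illustrative, and the ./customs/ directory (plus ./prompts/ when Whisper transcription is used) must already exist:

```python
import soundfile as sf

from utils.prompt_making import make_prompt
from utils.generation import SAMPLE_RATE, preload_models, generate_audio

# Encode a reference recording (< 15 seconds) into ./customs/my_voice.npz.
# Leaving transcript=None falls back to Whisper for the transcription.
make_prompt("my_voice", "./my_reference.wav", transcript="Welcome back, Master.")

# Reuse the saved prompt by name for zero-shot voice cloning.
preload_models()
audio_array = generate_audio("What can I do for you today?", prompt="my_voice")
sf.write("cloned.wav", audio_array, SAMPLE_RATE)
```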