diff --git a/app.py b/app.py
index cedd3b7cc1004f70008dec8a7cce7f263e8b6e3b..043a1a02df9c671e1d2ed871c975a037adf03d58 100644
--- a/app.py
+++ b/app.py
@@ -4,10 +4,18 @@ import os
import pathlib
import time
import tempfile
-from pathlib import Path
-temp = pathlib.WindowsPath
-pathlib.WindowsPath = pathlib.PosixPath
+import platform
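+# Checkpoints pickled on another OS may contain the other platform's pathlib Path class;
+# alias it to the native one so torch.load can unpickle them.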
+if platform.system().lower() == 'windows':
+ temp = pathlib.PosixPath
+ pathlib.PosixPath = pathlib.WindowsPath
+elif platform.system().lower() == 'linux':
+ temp = pathlib.WindowsPath
+ pathlib.WindowsPath = pathlib.PosixPath
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+
+import langid
+langid.set_languages(['en', 'zh', 'ja'])
+
import torch
import torchaudio
import random
@@ -22,48 +30,21 @@ from data.collation import get_text_token_collater
from models.vallex import VALLE
from utils.g2p import PhonemeBpeTokenizer
from descriptions import *
+from macros import *
import gradio as gr
import whisper
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-torch._C._jit_set_profiling_executor(False)
-torch._C._jit_set_profiling_mode(False)
-torch._C._set_graph_executor_optimize(False)
-# torch.manual_seed(42)
-
-lang2token = {
- 'zh': "[ZH]",
- 'ja': "[JA]",
- "en": "[EN]",
-}
-
-lang2code = {
- 'zh': 0,
- 'ja': 1,
- "en": 2,
-}
-
-token2lang = {
- '[ZH]': "zh",
- '[JA]': "ja",
- "[EN]": "en",
-}
-
-code2lang = {
- 0: 'zh',
- 1: 'ja',
- 2: "en",
-}
+import multiprocessing
+thread_count = multiprocessing.cpu_count()
+print("Use",thread_count,"cpu cores for computing")
-langdropdown2token = {
- 'English': "[EN]",
- '中文': "[ZH]",
- '日本語': "[JA]",
- 'mix': "",
-}
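+# Use every available core for intra- and inter-op CPU parallelism.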
+torch.set_num_threads(thread_count)
+torch.set_num_interop_threads(thread_count)
+torch._C._jit_set_profiling_executor(False)
+torch._C._jit_set_profiling_mode(False)
+torch._C._set_graph_executor_optimize(False)
text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
text_collater = get_text_token_collater()
@@ -74,30 +55,33 @@ if torch.cuda.is_available():
# VALL-E-X model
model = VALLE(
- 1024,
- 16,
- 12,
- norm_first=True,
- add_prenet=False,
- prefix_mode=1,
- share_embedding=True,
- nar_scale_factor=1.0,
- prepend_bos=True,
- num_quantizers=8,
-)
+ N_DIM,
+ NUM_HEAD,
+ NUM_LAYERS,
+ norm_first=True,
+ add_prenet=False,
+ prefix_mode=PREFIX_MODE,
+ share_embedding=True,
+ nar_scale_factor=1.0,
+ prepend_bos=True,
+ num_quantizers=NUM_QUANTIZERS,
+ )
checkpoint = torch.load("./epoch-10.pt", map_location='cpu')
missing_keys, unexpected_keys = model.load_state_dict(
checkpoint["model"], strict=True
)
assert not missing_keys
-model.to('cpu')
model.eval()
# Encodec model
audio_tokenizer = AudioTokenizer(device)
# ASR
-whisper_model = whisper.load_model("medium")
+whisper_model = whisper.load_model("medium").cpu()
+
+# Voice Presets
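+# Each preset is a pre-encoded .npz prompt shipped under ./presets/; the dropdowns list the file stems.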
+preset_list = next(os.walk("./presets/"))[2]
+preset_list = [preset[:-4] for preset in preset_list if preset.endswith(".npz")]
def clear_prompts():
try:
@@ -136,24 +120,38 @@ def transcribe_one(model, audio_path):
text_pr += "."
return lang, text_pr
-def make_npz_prompt(name, uploaded_audio, recorded_audio):
+def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
global model, text_collater, text_tokenizer, audio_tokenizer
clear_prompts()
audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
sr, wav_pr = audio_prompt
- wav_pr = torch.FloatTensor(wav_pr) / 32768
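+ # Gradio hands back (sample_rate, samples); reject prompts over 15 seconds, peak-normalize the
+ # waveform instead of assuming an int16 scale, and force a mono (1, T) layout.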
+ if len(wav_pr) / sr > 15:
+ return "Rejected, Audio too long (should be less than 15 seconds)", None
+ if not isinstance(wav_pr, torch.FloatTensor):
+ wav_pr = torch.FloatTensor(wav_pr)
+ if wav_pr.abs().max() > 1:
+ wav_pr /= wav_pr.abs().max()
if wav_pr.size(-1) == 2:
wav_pr = wav_pr.mean(-1, keepdim=False)
- text_pr, lang_pr = make_prompt(name, wav_pr, sr, save=False)
-
+ if wav_pr.ndim == 1:
+ wav_pr = wav_pr.unsqueeze(0)
+ assert wav_pr.ndim == 2 and wav_pr.size(0) == 1
+
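+ # Use the transcript the user typed when available; otherwise fall back to Whisper via make_prompt.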
+ if transcript_content == "":
+ text_pr, lang_pr = make_prompt(name, wav_pr, sr, save=False)
+ else:
+ lang_pr = langid.classify(str(transcript_content))[0]
+ lang_token = lang2token[lang_pr]
+ text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
# tokenize audio
- encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr.unsqueeze(0), sr))
+ encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
# tokenize text
+ phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
text_tokens, enroll_x_lens = text_collater(
[
- text_tokenizer.tokenize(text=f"{text_pr}".strip())
+ phonemes
]
)
@@ -166,8 +164,8 @@ def make_npz_prompt(name, uploaded_audio, recorded_audio):
def make_prompt(name, wav, sr, save=True):
-
global whisper_model
+ whisper_model.to(device)
if not isinstance(wav, torch.FloatTensor):
wav = torch.tensor(wav)
if wav.abs().max() > 1:
@@ -187,19 +185,41 @@ def make_prompt(name, wav, sr, save=True):
os.remove(f"./prompts/{name}.wav")
os.remove(f"./prompts/{name}.txt")
+ whisper_model.cpu()
torch.cuda.empty_cache()
return text, lang
@torch.no_grad()
-def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
+def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt, transcript_content):
+ if len(text) > 150:
+ return "Rejected, Text too long (should be less than 150 characters)", None
global model, text_collater, text_tokenizer, audio_tokenizer
+ model.to(device)
audio_prompt = audio_prompt if audio_prompt is not None else record_audio_prompt
sr, wav_pr = audio_prompt
- wav_pr = torch.FloatTensor(wav_pr)/32768
+ if len(wav_pr) / sr > 15:
+ return "Rejected, Audio too long (should be less than 15 seconds)", None
+ if not isinstance(wav_pr, torch.FloatTensor):
+ wav_pr = torch.FloatTensor(wav_pr)
+ if wav_pr.abs().max() > 1:
+ wav_pr /= wav_pr.abs().max()
if wav_pr.size(-1) == 2:
wav_pr = wav_pr.mean(-1, keepdim=False)
- text_pr, lang_pr = make_prompt(str(random.randint(0, 10000000)), wav_pr, sr, save=False)
- lang_token = langdropdown2token[language]
+ if wav_pr.ndim == 1:
+ wav_pr = wav_pr.unsqueeze(0)
+ assert wav_pr.ndim == 2 and wav_pr.size(0) == 1
+
+ if transcript_content == "":
+ text_pr, lang_pr = make_prompt('dummy', wav_pr, sr, save=False)
+ else:
+ lang_pr = langid.classify(str(transcript_content))[0]
+ lang_token = lang2token[lang_pr]
+ text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
+
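+ # 'auto-detect' classifies the target text with langid; otherwise map the dropdown label to a language token.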
+ if language == 'auto-detect':
+ lang_token = lang2token[langid.classify(text)[0]]
+ else:
+ lang_token = langdropdown2token[language]
lang = token2lang[lang_token]
text = lang_token + text + lang_token
@@ -207,24 +227,28 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
model.to(device)
# tokenize audio
- encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr.unsqueeze(0), sr))
+ encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)
# tokenize text
logging.info(f"synthesize text: {text}")
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
text_tokens, text_tokens_lens = text_collater(
[
- text_tokenizer.tokenize(text=f"{text_pr}{text}".strip())
+ phone_tokens
]
)
enroll_x_lens = None
if text_pr:
- _, enroll_x_lens = text_collater(
+ text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
+ text_prompts, enroll_x_lens = text_collater(
[
- text_tokenizer.tokenize(text=f"{text_pr}".strip())
+ text_prompts
]
)
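+ # Prepend the prompt's phoneme tokens so the model is conditioned on prompt text + target text.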
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
+ text_tokens_lens += enroll_x_lens
lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
encoded_frames = model.inference(
text_tokens.to(device),
@@ -234,7 +258,7 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
top_k=-100,
temperature=1,
prompt_language=lang_pr,
- text_language=lang,
+ text_language=langs if accent == "no-accent" else lang,
)
samples = audio_tokenizer.decode(
[(encoded_frames.transpose(2, 1), None)]
@@ -248,17 +272,24 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
return message, (24000, samples[0][0].cpu().numpy())
@torch.no_grad()
-def infer_from_prompt(text, language, accent, prompt_file):
- # onload model
- model.to(device)
+def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
+ if len(text) > 150:
+ return "Rejected, Text too long (should be less than 150 characters)", None
clear_prompts()
+ model.to(device)
# text to synthesize
- lang_token = langdropdown2token[language]
+ if language == 'auto-detect':
+ lang_token = lang2token[langid.classify(text)[0]]
+ else:
+ lang_token = langdropdown2token[language]
lang = token2lang[lang_token]
text = lang_token + text + lang_token
# load prompt
- prompt_data = np.load(prompt_file.name)
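+ # An uploaded .npz prompt takes precedence over the selected voice preset.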
+ if prompt_file is not None:
+ prompt_data = np.load(prompt_file.name)
+ else:
+ prompt_data = np.load(os.path.join("./presets/", f"{preset_prompt}.npz"))
audio_prompts = prompt_data['audio_tokens']
text_prompts = prompt_data['text_tokens']
lang_pr = prompt_data['lang_code']
@@ -270,9 +301,10 @@ def infer_from_prompt(text, language, accent, prompt_file):
enroll_x_lens = text_prompts.shape[-1]
logging.info(f"synthesize text: {text}")
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
text_tokens, text_tokens_lens = text_collater(
[
- text_tokenizer.tokenize(text=f"_{text}".strip())
+ phone_tokens
]
)
text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
@@ -287,13 +319,11 @@ def infer_from_prompt(text, language, accent, prompt_file):
top_k=-100,
temperature=1,
prompt_language=lang_pr,
- text_language=lang,
+ text_language=langs if accent == "no-accent" else lang,
)
samples = audio_tokenizer.decode(
[(encoded_frames.transpose(2, 1), None)]
)
-
- # offload model
model.to('cpu')
torch.cuda.empty_cache()
@@ -301,6 +331,144 @@ def infer_from_prompt(text, language, accent, prompt_file):
return message, (24000, samples[0][0].cpu().numpy())
+from utils.sentence_cutter import split_text_into_sentences
+@torch.no_grad()
+def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
+ """
+ For long audio generation, two modes are available.
+ fixed-prompt: keeps using the same prompt the user provided and generates the audio sentence by sentence.
+ sliding-window: uses the last generated sentence as the prompt for the next one, which may cause the speaker identity to drift.
+ """
+ if len(text) > 1000:
+ return "Rejected, Text too long (should be less than 1000 characters)", None
+ mode = 'fixed-prompt'
+ global model, audio_tokenizer, text_tokenizer, text_collater
+ model.to(device)
+ if (prompt is None or prompt == "") and preset_prompt == "":
+ mode = 'sliding-window' # If no prompt is given, use sliding-window mode
+ sentences = split_text_into_sentences(text)
+ # detect language
+ if language == "auto-detect":
+ language = langid.classify(text)[0]
+ else:
+ language = token2lang[langdropdown2token[language]]
+
+ # if initial prompt is given, encode it
+ if prompt is not None and prompt != "":
+ # load prompt
+ prompt_data = np.load(prompt.name)
+ audio_prompts = prompt_data['audio_tokens']
+ text_prompts = prompt_data['text_tokens']
+ lang_pr = prompt_data['lang_code']
+ lang_pr = code2lang[int(lang_pr)]
+
+ # numpy to tensor
+ audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
+ text_prompts = torch.tensor(text_prompts).type(torch.int32)
+ elif preset_prompt is not None and preset_prompt != "":
+ prompt_data = np.load(os.path.join("./presets/", f"{preset_prompt}.npz"))
+ audio_prompts = prompt_data['audio_tokens']
+ text_prompts = prompt_data['text_tokens']
+ lang_pr = prompt_data['lang_code']
+ lang_pr = code2lang[int(lang_pr)]
+
+ # numpy to tensor
+ audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
+ text_prompts = torch.tensor(text_prompts).type(torch.int32)
+ else:
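+ # No prompt given: start from empty prompt tensors and assume the target language for the prompt.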
+ audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
+ text_prompts = torch.zeros([1, 0]).type(torch.int32)
+ lang_pr = language if language != 'mix' else 'en'
+ if mode == 'fixed-prompt':
+ complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
+ for text in sentences:
+ text = text.replace("\n", "").strip(" ")
+ if text == "":
+ continue
+ lang_token = lang2token[language]
+ lang = token2lang[lang_token]
+ text = lang_token + text + lang_token
+
+ enroll_x_lens = text_prompts.shape[-1]
+ logging.info(f"synthesize text: {text}")
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
+ text_tokens, text_tokens_lens = text_collater(
+ [
+ phone_tokens
+ ]
+ )
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
+ text_tokens_lens += enroll_x_lens
+ # accent control
+ lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
+ encoded_frames = model.inference(
+ text_tokens.to(device),
+ text_tokens_lens.to(device),
+ audio_prompts,
+ enroll_x_lens=enroll_x_lens,
+ top_k=-100,
+ temperature=1,
+ prompt_language=lang_pr,
+ text_language=langs if accent == "no-accent" else lang,
+ )
+ complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
+ samples = audio_tokenizer.decode(
+ [(complete_tokens, None)]
+ )
+ model.to('cpu')
+ message = f"Cut into {len(sentences)} sentences"
+ return message, (24000, samples[0][0].cpu().numpy())
+ elif mode == "sliding-window":
+ complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
+ original_audio_prompts = audio_prompts
+ original_text_prompts = text_prompts
+ for text in sentences:
+ text = text.replace("\n", "").strip(" ")
+ if text == "":
+ continue
+ lang_token = lang2token[language]
+ lang = token2lang[lang_token]
+ text = lang_token + text + lang_token
+
+ enroll_x_lens = text_prompts.shape[-1]
+ logging.info(f"synthesize text: {text}")
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
+ text_tokens, text_tokens_lens = text_collater(
+ [
+ phone_tokens
+ ]
+ )
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
+ text_tokens_lens += enroll_x_lens
+ # accent control
+ lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
+ encoded_frames = model.inference(
+ text_tokens.to(device),
+ text_tokens_lens.to(device),
+ audio_prompts,
+ enroll_x_lens=enroll_x_lens,
+ top_k=-100,
+ temperature=1,
+ prompt_language=lang_pr,
+ text_language=langs if accent == "no-accent" else lang,
+ )
+ complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
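+ # Slide the window: condition the next sentence on the one just generated (the 1.0 threshold
+ # means the original-prompt branch below is effectively never taken).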
+ if torch.rand(1) < 1.0:
+ audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
+ text_prompts = text_tokens[:, enroll_x_lens:]
+ else:
+ audio_prompts = original_audio_prompts
+ text_prompts = original_text_prompts
+ samples = audio_tokenizer.decode(
+ [(complete_tokens, None)]
+ )
+ model.to('cpu')
+ message = f"Cut into {len(sentences)} sentences"
+ return message, (24000, samples[0][0].cpu().numpy())
+ else:
+ raise ValueError(f"No such mode {mode}")
+
+
def main():
app = gr.Blocks()
with app:
@@ -312,9 +480,12 @@ def main():
textbox = gr.TextArea(label="Text",
placeholder="Type your sentence here",
- value="VALLEX can synthesize personalized speech in another language for a monolingual speaker.", elem_id=f"tts-input")
- language_dropdown = gr.Dropdown(choices=['English', '中文', '日本語'], value='English', label='language')
+ value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
+ language_dropdown = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='English', label='language')
accent_dropdown = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', label='accent')
+ textbox_transcript = gr.TextArea(label="Transcript",
+ placeholder="Write transcript here. (leave empty to use whisper)",
+ value="", elem_id=f"prompt-name")
upload_audio_prompt = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
with gr.Column():
@@ -322,7 +493,7 @@ def main():
audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
btn = gr.Button("Generate!")
btn.click(infer_from_audio,
- inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt],
+ inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
outputs=[text_output, audio_output])
textbox_mp = gr.TextArea(label="Prompt name",
placeholder="Name your prompt here",
@@ -330,7 +501,7 @@ def main():
btn_mp = gr.Button("Make prompt!")
prompt_output = gr.File(interactive=False)
btn_mp.click(make_npz_prompt,
- inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt],
+ inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
outputs=[text_output, prompt_output])
with gr.Tab("Make prompt"):
gr.Markdown(make_prompt_md)
@@ -339,6 +510,10 @@ def main():
textbox2 = gr.TextArea(label="Prompt name",
placeholder="Name your prompt here",
value="prompt_1", elem_id=f"prompt-name")
+ # Place for selecting the language and entering a transcript
+ textbox_transcript2 = gr.TextArea(label="Transcript",
+ placeholder="Write transcript here. (leave empty to use whisper)",
+ value="", elem_id=f"prompt-name")
upload_audio_prompt_2 = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
record_audio_prompt_2 = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
with gr.Column():
@@ -346,7 +521,7 @@ def main():
prompt_output_2 = gr.File(interactive=False)
btn_2 = gr.Button("Make!")
btn_2.click(make_npz_prompt,
- inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2],
+ inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
outputs=[text_output_2, prompt_output_2])
with gr.Tab("Infer from prompt"):
gr.Markdown(infer_from_prompt_md)
@@ -354,19 +529,40 @@ def main():
with gr.Column():
textbox_3 = gr.TextArea(label="Text",
placeholder="Type your sentence here",
- value="VALLEX can synthesize personalized speech in another language for a monolingual speaker.", elem_id=f"tts-input")
- language_dropdown_3 = gr.Dropdown(choices=['English', '中文', '日本語'], value='English',
+ value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
+ language_dropdown_3 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語', 'Mix'], value='auto-detect',
label='language')
accent_dropdown_3 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
label='accent')
+ preset_dropdown_3 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
prompt_file = gr.File(file_count='single', file_types=['.npz'], interactive=True)
with gr.Column():
text_output_3 = gr.Textbox(label="Message")
audio_output_3 = gr.Audio(label="Output Audio", elem_id="tts-audio")
btn_3 = gr.Button("Generate!")
btn_3.click(infer_from_prompt,
- inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, prompt_file],
+ inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
outputs=[text_output_3, audio_output_3])
+ with gr.Tab("Infer long text"):
+ gr.Markdown("This is a long text generation demo. You can use this to generate long audio. ")
+ with gr.Row():
+ with gr.Column():
+ textbox_4 = gr.TextArea(label="Text",
+ placeholder="Type your sentence here",
+ value=long_text_example, elem_id=f"tts-input")
+ language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
+ label='language')
+ accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
+ label='accent')
+ preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
+ prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True)
+ with gr.Column():
+ text_output_4 = gr.TextArea(label="Message")
+ audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio")
+ btn_4 = gr.Button("Generate!")
+ btn_4.click(infer_long_text,
+ inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
+ outputs=[text_output_4, audio_output_4])
app.launch()
diff --git a/descriptions.py b/descriptions.py
index ec34debc5a48b2d58150171e55670e30b867d4dd..1d366b27cd1911b95028beea3e248d744567eb7a 100644
--- a/descriptions.py
+++ b/descriptions.py
@@ -1,8 +1,5 @@
top_md = """
# VALL-E X
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1yyD_sz531QntLKowMHo-XxorsFBCfKul?usp=sharing)
-[![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/Plachtaa/vallex-webui)
-Unofficial implementation of Microsoft's [VALL-E X](https://arxiv.org/pdf/2303.03926).
VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.
This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)
@@ -24,4 +21,6 @@ Get a `.npz` file as the encoded audio prompt. Use it by **"Infer with prompt"**
infer_from_prompt_md = """
Faster than **"Infer from audio"**.
You need to **"Make prompt"** first, and upload the encoded prompt (a `.npz` file)
-"""
\ No newline at end of file
+"""
+
+long_text_example = "Just a few years ago, there were no legions of deep learning scientists developing intelligent products and services at major companies and startups. When we entered the field, machine learning did not command headlines in daily newspapers. Our parents had no idea what machine learning was, let alone why we might prefer it to a career in medicine or law. Machine learning was a blue skies academic discipline whose industrial significance was limited to a narrow set of real-world applications, including speech recognition and computer vision. Moreover, many of these applications required so much domain knowledge that they were often regarded as entirely separate areas for which machine learning was one small component. At that time, neural networks—the predecessors of the deep learning methods that we focus on in this book—were generally regarded as outmoded."
\ No newline at end of file
diff --git a/macros.py b/macros.py
new file mode 100644
index 0000000000000000000000000000000000000000..b192fccde1a11da26cff026c9a08c8ff54915907
--- /dev/null
+++ b/macros.py
@@ -0,0 +1,39 @@
+NUM_LAYERS = 12
+NUM_HEAD = 16
+N_DIM = 1024
+PREFIX_MODE = 1
+NUM_QUANTIZERS = 8
+SAMPLE_RATE = 24000
+
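+# Language lookup tables shared via `from macros import *`; the empty token / 'mix' entries denote mixed-language text.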
+lang2token = {
+ 'zh': "[ZH]",
+ 'ja': "[JA]",
+ "en": "[EN]",
+ 'mix': "",
+}
+
+lang2code = {
+ 'zh': 0,
+ 'ja': 1,
+ "en": 2,
+}
+
+token2lang = {
+ '[ZH]': "zh",
+ '[JA]': "ja",
+ "[EN]": "en",
+ "": "mix"
+}
+
+code2lang = {
+ 0: 'zh',
+ 1: 'ja',
+ 2: "en",
+}
+
+langdropdown2token = {
+ 'English': "[EN]",
+ '中文': "[ZH]",
+ '日本語': "[JA]",
+ 'Mix': "",
+}
\ No newline at end of file
diff --git a/presets/acou_1.npz b/presets/acou_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f6c51bd1c0a5dc6eebcf3c63c17c05d1d612f6ff
--- /dev/null
+++ b/presets/acou_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:470ce66fc24a2d14e162343381f7d93ef0a3af51edf5fd37240c21f492b4e769
+size 15650
diff --git a/presets/acou_2.npz b/presets/acou_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1e055e2639e010f57e74d11cd37d134f8d5ee05e
--- /dev/null
+++ b/presets/acou_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec1c5328751cadeed5356d4264759799ad96d33ea8dd4f8a3d0a80dd8ddb0e74
+size 15426
diff --git a/presets/acou_3.npz b/presets/acou_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1eb6978a203b4df5124bf745c1fde591d1864ce7
--- /dev/null
+++ b/presets/acou_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03f241b094a32b3f542e74374183c6d15e8b70ae73ceeafb11bfd4ee6b8b4a3a
+size 15410
diff --git a/presets/acou_4.npz b/presets/acou_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c0e623ffed42dd0fd089e928a79eeb25721ba6d3
--- /dev/null
+++ b/presets/acou_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52b96f32863f13f84cf7ac4a27d2bc95cea70c350a037f4d1890b20b8da9501e
+size 15506
diff --git a/presets/amused.npz b/presets/amused.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3d9b45ee3d7e557bb754d6564312479b92acf5fc
--- /dev/null
+++ b/presets/amused.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df3e882f3a62805b9aaf300d81822cd4eddeafee480503b7b78e32be2085fb11
+size 20882
diff --git a/presets/anger.npz b/presets/anger.npz
new file mode 100644
index 0000000000000000000000000000000000000000..26477928feb6c7da2b0bb3b29ba3122adf2a000e
--- /dev/null
+++ b/presets/anger.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:959cec6dc0b30219db0d70cdd165fe00bbdc098165cf9d67ccdd1ecf7a5da5be
+size 22090
diff --git a/presets/babara.npz b/presets/babara.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9a484d8b9a6ad6a907e426eccda7b0a4e6e8884e
--- /dev/null
+++ b/presets/babara.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8106b2a98c3f70587f23ab46ed5bf73b1c9a770481c3620ab140bd3256010376
+size 11526
diff --git a/presets/bronya_1.npz b/presets/bronya_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..361939a93a9fd2c00c775bb761f4a8afd9d226a9
--- /dev/null
+++ b/presets/bronya_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02eaada2c3d58866c813887ed9f871587ef5a7e976abc23382ce46a17b208001
+size 18106
diff --git a/presets/dingzhen.npz b/presets/dingzhen.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4da9178da67661edeb4868d9e251b016db846511
--- /dev/null
+++ b/presets/dingzhen.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d19167c65eefef5e42dfaa1919ff5149ca0a93cb052396a47d1f42f9865f5f8
+size 18154
diff --git a/presets/disgust.npz b/presets/disgust.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fa775736b826d61213653a808855eaf8d263c61d
--- /dev/null
+++ b/presets/disgust.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4443f0a395072700f2ec6101dbf2ad9d28968aa3e5809e384ea131832f894d7f
+size 39386
diff --git a/presets/emo_amused.npz b/presets/emo_amused.npz
new file mode 100644
index 0000000000000000000000000000000000000000..545712470a78ae6b3f91308779b612c9b8ef33b4
--- /dev/null
+++ b/presets/emo_amused.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38be2ea16dc79beae68b6c885d99d4dad516acbd88ed5ed6991dd97301f2f30b
+size 15378
diff --git a/presets/emo_anger.npz b/presets/emo_anger.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8cbf61bb2353db8a1337debe68e6c5113099fe46
--- /dev/null
+++ b/presets/emo_anger.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3261c3bdd5b7b4be9783d9293ee3d871be9d9d791f2b3a8bf62a1a0ee0ed93e6
+size 15434
diff --git a/presets/emo_neutral.npz b/presets/emo_neutral.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ce1da3b25448c86b3ec2b2d2d0f19c56bca789c8
--- /dev/null
+++ b/presets/emo_neutral.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2188c4154692316ed7c0edee3aa3dd8678be36f355ee2b8c8a3a6412c3673ba9
+size 15578
diff --git a/presets/emo_sleepy.npz b/presets/emo_sleepy.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b39ef24ea839f0a67663610473c2026751b96a72
--- /dev/null
+++ b/presets/emo_sleepy.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a53255890beaf4ed339e1967f0837fdb87c34c9f7e18bf77cd4b08eba176963
+size 15370
diff --git a/presets/en2zh_tts_1.npz b/presets/en2zh_tts_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e73db03e27078932694dfdb6df5cc849c6bcc3d7
--- /dev/null
+++ b/presets/en2zh_tts_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d4de4ed055448ea54f7b40091afae565197f960d954279035ac537ea5a01bc4
+size 44354
diff --git a/presets/en2zh_tts_2.npz b/presets/en2zh_tts_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d15ad2188a0f5fead60165d86c825dec7a914ac2
--- /dev/null
+++ b/presets/en2zh_tts_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcc066ea104daa27d1552fe76574d09359d56fa892241581cc19e931a696eca9
+size 24178
diff --git a/presets/en2zh_tts_3.npz b/presets/en2zh_tts_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f0aa9306b71c23cfadfd6eae0bb0b7a84084fade
--- /dev/null
+++ b/presets/en2zh_tts_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7468944e6d0ed7f2da033e8037be07dbafc76bd1ed7c0f5996d85ff45aacda11
+size 21410
diff --git a/presets/en2zh_tts_4.npz b/presets/en2zh_tts_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b52465fadebb7f7f163a26f2e9d9633f703ad039
--- /dev/null
+++ b/presets/en2zh_tts_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fd8d0914e74769114310e9504d68d6b7b0c6aacd46763478cbfd4f9631ad54a
+size 43826
diff --git a/presets/fuxuan_2.npz b/presets/fuxuan_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aaeb7f8bc5af0680a2d64e452e1d029f592aa44b
--- /dev/null
+++ b/presets/fuxuan_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17b90388d179ae309e1f577c28c3f10d9bed73c6ccbffdd829c00568eb3941e6
+size 50330
diff --git a/presets/librispeech_1.npz b/presets/librispeech_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e2480cc12a6a526df5c552700f1507675cee62d8
--- /dev/null
+++ b/presets/librispeech_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:415b244e43b45291fd651d71f15bb7a31c244e2054988c436f6bbc04465c6099
+size 15650
diff --git a/presets/librispeech_2.npz b/presets/librispeech_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0eed46188be3dea3293903a13daa718ab0c802c1
--- /dev/null
+++ b/presets/librispeech_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd74e77370248b025321b9dbae25b1572f13f98da63255e384d382d2b0c78227
+size 15418
diff --git a/presets/librispeech_3.npz b/presets/librispeech_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fbaa57d5d3c106ea9a77af43a6a2a3c0d3045773
--- /dev/null
+++ b/presets/librispeech_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1eceb3f4cc0f3a8856b5e3b5f1ca28c428d75305b1452da1ecf4013bc358ccaa
+size 15634
diff --git a/presets/librispeech_4.npz b/presets/librispeech_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3516ee92a587b51c645856122a12503386f5dd28
--- /dev/null
+++ b/presets/librispeech_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3939dde39f5e65bc01f5eba9acb7b8329465aaca3c38edf1b240aa714e687960
+size 15594
diff --git a/presets/neutral.npz b/presets/neutral.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6af010decf0d7459e76a0764a6495ecd9758c524
--- /dev/null
+++ b/presets/neutral.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8a63993526ffdc788a711b512d07a8b1c816151a1edb63913d0bfb48c2ea380
+size 21050
diff --git a/presets/paimon_1.npz b/presets/paimon_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8e9cf23f35e99a3791ea54ac8f0700dd188d9db5
--- /dev/null
+++ b/presets/paimon_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:452d5e0cd3a060db521bd65a16af818a6177f357801402aa5581eceb2c24039a
+size 13762
diff --git a/presets/rosalia.npz b/presets/rosalia.npz
new file mode 100644
index 0000000000000000000000000000000000000000..800162152c8207d2c491b8c4018bf177ab6f8c8a
--- /dev/null
+++ b/presets/rosalia.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af87ebe283bbb7b527c6c0ff0a02a315416485677fe23330040c2766fa9af919
+size 11414
diff --git a/presets/seel.npz b/presets/seel.npz
new file mode 100644
index 0000000000000000000000000000000000000000..095b1754f23a1030296b2a8f8f90b230e4b6dc1e
--- /dev/null
+++ b/presets/seel.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44ad2e900df3625f9753e949dc5a7d8479c4091e24cb18cbf46e34e29498d952
+size 13554
diff --git a/presets/sleepiness.npz b/presets/sleepiness.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5b6bfc27f36658c0f62272ce30f357fec5911f97
--- /dev/null
+++ b/presets/sleepiness.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0f866a278a10c7b6b494fb62589a9d8fef778ccf272df3b0d5510f45b243b5c
+size 33218
diff --git a/presets/vctk_1.npz b/presets/vctk_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c23c917cdcc846bbd047edd409b182d236aa6d28
--- /dev/null
+++ b/presets/vctk_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c9df2ea8c2bc919c0ac50f8e05950bb4e831de69b33a7fb12d584da5b2512f2
+size 15530
diff --git a/presets/vctk_2.npz b/presets/vctk_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a671e453cd54cf7345c5a1199b70280f877dae0d
--- /dev/null
+++ b/presets/vctk_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc84744435a304b3e700b8b1ab94c3b891db3056bd55a0f9dd99eff284016efa
+size 15458
diff --git a/presets/vctk_3.npz b/presets/vctk_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1c045ead518d9f37699a0b59ebe57296e0542aef
--- /dev/null
+++ b/presets/vctk_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec0d528c6ae9c8f32b02ca6b57aa565b9fe63f401fd04f2632ed7e536699b9ac
+size 15450
diff --git a/presets/vctk_4.npz b/presets/vctk_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1fbfbbdd4ef4e292e24f7276defadaefdcf0e98b
--- /dev/null
+++ b/presets/vctk_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ff2b71254ae00be6e42ad206c7616d168bd41582837e9eeb4d6cd669bd0b140
+size 15330
diff --git a/presets/yaesakura.npz b/presets/yaesakura.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3f6b151870c881c61eb232dbb28c1403a67532df
--- /dev/null
+++ b/presets/yaesakura.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b388a18d286b4ba13d45bae373a716c0010dc40ae9c940d53b5a04cbc64e95ff
+size 12442
diff --git a/presets/zh2en_tts_1.npz b/presets/zh2en_tts_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bbd2a9c750af5b6cac656b01ef36c2dd3ee766f7
--- /dev/null
+++ b/presets/zh2en_tts_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07bff150ad145f9b06f0e7cbf9b0ee4d9e926600efa0d129bd831c8b2993c2b0
+size 23546
diff --git a/presets/zh2en_tts_2.npz b/presets/zh2en_tts_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..644f6cf976b91b284316a5e4513b72980d7557a8
--- /dev/null
+++ b/presets/zh2en_tts_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0257d0782578c7813c3f43b5e93c0e681f9ea42fe76775d5a4f4fea64609b03e
+size 20170
diff --git a/presets/zh2en_tts_3.npz b/presets/zh2en_tts_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fe2ce9d14ae1af4ee307d1b0a109c141141957d9
--- /dev/null
+++ b/presets/zh2en_tts_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5da48e060d15f391767bffe1d528bfbc782a562413feed2e9bd2cafa82bf644a
+size 17906
diff --git a/presets/zh2en_tts_4.npz b/presets/zh2en_tts_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..693e32dc6f27b91270c8c466b1a6671fb0ed7054
--- /dev/null
+++ b/presets/zh2en_tts_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bda7a70ed9b03d8f1ff99d2444ea1df476a8deaf75633aa3b3f6cf3f45ae7e5e
+size 33682
diff --git a/requirements.txt b/requirements.txt
index fd4f265658ad0d8c180146f251e79f3acb3a1062..06d5958b402f86f2fe123caff1f0b4810bec178b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,10 @@
+soundfile
numpy
-torch
-torchaudio
+torch==2.0.1
+torchvision==0.15.2
+torchaudio==2.0.2
tokenizers
encodec
+langid
unidecode
pyopenjtalk
pypinyin
@@ -11,4 +13,8 @@ cn2an
jieba
eng_to_ipa
jieba
-openai-whisper
\ No newline at end of file
+SudachiPy
+sudachidict_core
+nltk
+gdown
+openai-whisper
+phonemizer
+matplotlib
+gradio
\ No newline at end of file
diff --git a/utils/__pycache__/__init__.cpython-38.pyc b/utils/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe3ce17582a9b3e70aa7dcad5bf2dfd6b7fadead
Binary files /dev/null and b/utils/__pycache__/__init__.cpython-38.pyc differ
diff --git a/utils/g2p/__pycache__/__init__.cpython-38.pyc b/utils/g2p/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f5652536aec5cb4909f4b8c4d6f82ef784263f4
Binary files /dev/null and b/utils/g2p/__pycache__/__init__.cpython-38.pyc differ
diff --git a/utils/g2p/__pycache__/cleaners.cpython-38.pyc b/utils/g2p/__pycache__/cleaners.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a0a76161b1a4d572b0b265e5a8b10a0dca9e4e6
Binary files /dev/null and b/utils/g2p/__pycache__/cleaners.cpython-38.pyc differ
diff --git a/utils/g2p/__pycache__/english.cpython-38.pyc b/utils/g2p/__pycache__/english.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e4acb6d4ef61dffaaac2ee7068dea87565667c3
Binary files /dev/null and b/utils/g2p/__pycache__/english.cpython-38.pyc differ
diff --git a/utils/g2p/__pycache__/japanese.cpython-38.pyc b/utils/g2p/__pycache__/japanese.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c9d9e630232f20ea50434d8d1cab07658eb5b70
Binary files /dev/null and b/utils/g2p/__pycache__/japanese.cpython-38.pyc differ
diff --git a/utils/g2p/__pycache__/mandarin.cpython-38.pyc b/utils/g2p/__pycache__/mandarin.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b72016aa9b874286d2b4e824343406d91db2c773
Binary files /dev/null and b/utils/g2p/__pycache__/mandarin.cpython-38.pyc differ
diff --git a/utils/g2p/__pycache__/symbols.cpython-38.pyc b/utils/g2p/__pycache__/symbols.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..922be9f846d866df964da05509553e79476cc119
Binary files /dev/null and b/utils/g2p/__pycache__/symbols.cpython-38.pyc differ
diff --git a/utils/generation.py b/utils/generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..30ed3164c69b7fc864edc77b30228d5ae279ca54
--- /dev/null
+++ b/utils/generation.py
@@ -0,0 +1,256 @@
+import os
+import torch
+import gdown
+import logging
+import langid
+langid.set_languages(['en', 'zh', 'ja'])
+
+import pathlib
+import platform
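+# Same cross-platform pathlib alias as in app.py, so pickled checkpoints load on either OS.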
+if platform.system().lower() == 'windows':
+ temp = pathlib.PosixPath
+ pathlib.PosixPath = pathlib.WindowsPath
+elif platform.system().lower() == 'linux':
+ temp = pathlib.WindowsPath
+ pathlib.WindowsPath = pathlib.PosixPath
+
+import numpy as np
+from data.tokenizer import (
+ AudioTokenizer,
+ tokenize_audio,
+)
+from data.collation import get_text_token_collater
+from models.vallex import VALLE
+from utils.g2p import PhonemeBpeTokenizer
+from utils.sentence_cutter import split_text_into_sentences
+
+from macros import *
+
+device = torch.device("cpu")
+if torch.cuda.is_available():
+ device = torch.device("cuda", 0)
+
+url = 'https://drive.google.com/file/d/10gdQWvP-K_e1undkvv0p2b7SU6I4Egyl/view?usp=sharing'
+
+checkpoints_dir = "./checkpoints/"
+
+model_checkpoint_name = "vallex-checkpoint.pt"
+
+model = None
+
+codec = None
+
+text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
+text_collater = get_text_token_collater()
+
+def preload_models():
+ global model, codec
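+ # Download the checkpoint from Google Drive on first use, then build the VALL-E model and the EnCodec codec.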
+ if not os.path.exists(checkpoints_dir): os.mkdir(checkpoints_dir)
+ if not os.path.exists(os.path.join(checkpoints_dir, model_checkpoint_name)):
+ gdown.download(id="10gdQWvP-K_e1undkvv0p2b7SU6I4Egyl", output=os.path.join(checkpoints_dir, model_checkpoint_name), quiet=False)
+ # VALL-E
+ model = VALLE(
+ N_DIM,
+ NUM_HEAD,
+ NUM_LAYERS,
+ norm_first=True,
+ add_prenet=False,
+ prefix_mode=PREFIX_MODE,
+ share_embedding=True,
+ nar_scale_factor=1.0,
+ prepend_bos=True,
+ num_quantizers=NUM_QUANTIZERS,
+ ).to(device)
+ checkpoint = torch.load(os.path.join(checkpoints_dir, model_checkpoint_name), map_location='cpu')
+ missing_keys, unexpected_keys = model.load_state_dict(
+ checkpoint["model"], strict=True
+ )
+ assert not missing_keys
+ model.eval()
+
+ # Encodec
+ codec = AudioTokenizer(device)
+
+@torch.no_grad()
+def generate_audio(text, prompt=None, language='auto', accent='no-accent'):
+ global model, codec, text_tokenizer, text_collater
+ text = text.replace("\n", "").strip(" ")
+ # detect language
+ if language == "auto":
+ language = langid.classify(text)[0]
+ lang_token = lang2token[language]
+ lang = token2lang[lang_token]
+ text = lang_token + text + lang_token
+
+ # load prompt
+ if prompt is not None:
+ prompt_path = prompt
+ if not os.path.exists(prompt_path):
+ prompt_path = "./presets/" + prompt + ".npz"
+ if not os.path.exists(prompt_path):
+ prompt_path = "./customs/" + prompt + ".npz"
+ if not os.path.exists(prompt_path):
+ raise ValueError(f"Cannot find prompt {prompt}")
+ prompt_data = np.load(prompt_path)
+ audio_prompts = prompt_data['audio_tokens']
+ text_prompts = prompt_data['text_tokens']
+ lang_pr = prompt_data['lang_code']
+ lang_pr = code2lang[int(lang_pr)]
+
+ # numpy to tensor
+ audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
+ text_prompts = torch.tensor(text_prompts).type(torch.int32)
+ else:
+ audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
+ text_prompts = torch.zeros([1, 0]).type(torch.int32)
+ lang_pr = lang if lang != 'mix' else 'en'
+
+ enroll_x_lens = text_prompts.shape[-1]
+ logging.info(f"synthesize text: {text}")
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
+ text_tokens, text_tokens_lens = text_collater(
+ [
+ phone_tokens
+ ]
+ )
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
+ text_tokens_lens += enroll_x_lens
+ # accent control
+ lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
+ encoded_frames = model.inference(
+ text_tokens.to(device),
+ text_tokens_lens.to(device),
+ audio_prompts,
+ enroll_x_lens=enroll_x_lens,
+ top_k=-100,
+ temperature=1,
+ prompt_language=lang_pr,
+ text_language=langs if accent == "no-accent" else lang,
+ )
+ samples = codec.decode(
+ [(encoded_frames.transpose(2, 1), None)]
+ )
+
+ return samples[0][0].cpu().numpy()
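+# Illustrative usage (preset names are the ./presets/ file stems, e.g. "paimon_1"):
+#   preload_models()
+#   audio = generate_audio("Hello there!", prompt="paimon_1")  # 24 kHz waveform as a numpy array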
+
+@torch.no_grad()
+def generate_audio_from_long_text(text, prompt=None, language='auto', accent='no-accent', mode='sliding-window'):
+ """
+ For long audio generation, two modes are available.
+ fixed-prompt: keeps using the same prompt the user provided and generates the audio sentence by sentence.
+ sliding-window: uses the last generated sentence as the prompt for the next one, which may cause the speaker identity to drift.
+ """
+ global model, codec, text_tokenizer, text_collater
+ if prompt is None or prompt == "":
+ mode = 'sliding-window' # If no prompt is given, use sliding-window mode
+ sentences = split_text_into_sentences(text)
+ # detect language
+ if language == "auto":
+ language = langid.classify(text)[0]
+
+ # if initial prompt is given, encode it
+ if prompt is not None and prompt != "":
+ prompt_path = prompt
+ if not os.path.exists(prompt_path):
+ prompt_path = "./presets/" + prompt + ".npz"
+ if not os.path.exists(prompt_path):
+ prompt_path = "./customs/" + prompt + ".npz"
+ if not os.path.exists(prompt_path):
+ raise ValueError(f"Cannot find prompt {prompt}")
+ prompt_data = np.load(prompt_path)
+ audio_prompts = prompt_data['audio_tokens']
+ text_prompts = prompt_data['text_tokens']
+ lang_pr = prompt_data['lang_code']
+ lang_pr = code2lang[int(lang_pr)]
+
+ # numpy to tensor
+ audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
+ text_prompts = torch.tensor(text_prompts).type(torch.int32)
+ else:
+ audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
+ text_prompts = torch.zeros([1, 0]).type(torch.int32)
+ lang_pr = language if language != 'mix' else 'en'
+ if mode == 'fixed-prompt':
+ complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
+ for text in sentences:
+ text = text.replace("\n", "").strip(" ")
+ if text == "":
+ continue
+ lang_token = lang2token[language]
+ lang = token2lang[lang_token]
+ text = lang_token + text + lang_token
+
+ enroll_x_lens = text_prompts.shape[-1]
+ logging.info(f"synthesize text: {text}")
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
+ text_tokens, text_tokens_lens = text_collater(
+ [
+ phone_tokens
+ ]
+ )
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
+ text_tokens_lens += enroll_x_lens
+ # accent control
+ lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
+ encoded_frames = model.inference(
+ text_tokens.to(device),
+ text_tokens_lens.to(device),
+ audio_prompts,
+ enroll_x_lens=enroll_x_lens,
+ top_k=-100,
+ temperature=1,
+ prompt_language=lang_pr,
+ text_language=langs if accent == "no-accent" else lang,
+ )
+ complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
+ samples = codec.decode(
+ [(complete_tokens, None)]
+ )
+ return samples[0][0].cpu().numpy()
+ elif mode == "sliding-window":
+ complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
+ original_audio_prompts = audio_prompts
+ original_text_prompts = text_prompts
+ for text in sentences:
+ text = text.replace("\n", "").strip(" ")
+ if text == "":
+ continue
+ lang_token = lang2token[language]
+ lang = token2lang[lang_token]
+ text = lang_token + text + lang_token
+
+ enroll_x_lens = text_prompts.shape[-1]
+ logging.info(f"synthesize text: {text}")
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
+ text_tokens, text_tokens_lens = text_collater(
+ [
+ phone_tokens
+ ]
+ )
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
+ text_tokens_lens += enroll_x_lens
+ # accent control
+ lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
+ encoded_frames = model.inference(
+ text_tokens.to(device),
+ text_tokens_lens.to(device),
+ audio_prompts,
+ enroll_x_lens=enroll_x_lens,
+ top_k=-100,
+ temperature=1,
+ prompt_language=lang_pr,
+ text_language=langs if accent == "no-accent" else lang,
+ )
+ complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
+ if torch.rand(1) < 0.5:
+ audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
+ text_prompts = text_tokens[:, enroll_x_lens:]
+ else:
+ audio_prompts = original_audio_prompts
+ text_prompts = original_text_prompts
+ samples = codec.decode(
+ [(complete_tokens, None)]
+ )
+ return samples[0][0].cpu().numpy()
+ else:
+ raise ValueError(f"No such mode {mode}")
\ No newline at end of file
diff --git a/utils/prompt_making.py b/utils/prompt_making.py
new file mode 100644
index 0000000000000000000000000000000000000000..93e4a3d647052df4899253fea41be22f09e006b8
--- /dev/null
+++ b/utils/prompt_making.py
@@ -0,0 +1,115 @@
+import os
+import torch
+import torchaudio
+import logging
+import langid
+import whisper
+langid.set_languages(['en', 'zh', 'ja'])
+
+import numpy as np
+from data.tokenizer import (
+ AudioTokenizer,
+ tokenize_audio,
+)
+from data.collation import get_text_token_collater
+from utils.g2p import PhonemeBpeTokenizer
+
+from macros import *
+
+text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
+text_collater = get_text_token_collater()
+
+device = torch.device("cpu")
+if torch.cuda.is_available():
+ device = torch.device("cuda", 0)
+
+codec = AudioTokenizer(device)
+
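+# Whisper is loaded lazily, only when a transcript has to be generated automatically.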
+whisper_model = None
+
+@torch.no_grad()
+def transcribe_one(model, audio_path):
+ # load audio and pad/trim it to fit 30 seconds
+ audio = whisper.load_audio(audio_path)
+ audio = whisper.pad_or_trim(audio)
+
+ # make log-Mel spectrogram and move to the same device as the model
+ mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+ # detect the spoken language
+ _, probs = model.detect_language(mel)
+ print(f"Detected language: {max(probs, key=probs.get)}")
+ lang = max(probs, key=probs.get)
+ # decode the audio
+ options = whisper.DecodingOptions(temperature=1.0, best_of=5, fp16=False if device == torch.device("cpu") else True, sample_len=150)
+ result = whisper.decode(model, mel, options)
+
+ # print the recognized text
+ print(result.text)
+
+ text_pr = result.text
+ if text_pr.strip(" ")[-1] not in "?!.,。,?!。、":
+ text_pr += "."
+ return lang, text_pr
+
+def make_prompt(name, audio_prompt_path, transcript=None):
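+ """Encode an audio file (plus an optional transcript) into a reusable .npz prompt under ./customs/.
+
+ Example (illustrative): make_prompt("my_voice", "sample.wav", transcript="Hello there.")
+ """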
+ global model, text_collater, text_tokenizer, codec
+ wav_pr, sr = torchaudio.load(audio_prompt_path)
+ # check length
+ if wav_pr.size(-1) / sr > 15:
+ raise ValueError(f"Prompt too long, expect length below 15 seconds, got {wav_pr / sr} seconds.")
+ if wav_pr.size(0) == 2:
+ wav_pr = wav_pr.mean(0, keepdim=True)
+ text_pr, lang_pr = make_transcript(name, wav_pr, sr, transcript)
+
+ # tokenize audio
+ encoded_frames = tokenize_audio(codec, (wav_pr, sr))
+ audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
+
+ # tokenize text
+ phonemes, langs = text_tokenizer.tokenize(text=f"{text_pr}".strip())
+ text_tokens, enroll_x_lens = text_collater(
+ [
+ phonemes
+ ]
+ )
+
+ message = f"Detected language: {lang_pr}\n Detected text {text_pr}\n"
+
+ # save as npz file
+ save_path = os.path.join("./customs/", f"{name}.npz")
+ np.savez(save_path, audio_tokens=audio_tokens, text_tokens=text_tokens, lang_code=lang2code[lang_pr])
+ logging.info(f"Successful. Prompt saved to {save_path}")
+
+
+def make_transcript(name, wav, sr, transcript=None):
+
+ if not isinstance(wav, torch.FloatTensor):
+ wav = torch.tensor(wav)
+ if wav.abs().max() > 1:
+ wav /= wav.abs().max()
+ if wav.size(-1) == 2:
+ wav = wav.mean(-1, keepdim=False)
+ if wav.ndim == 1:
+ wav = wav.unsqueeze(0)
+ assert wav.ndim == 2 and wav.size(0) == 1
+ if transcript is None or transcript == "":
+ logging.info("Transcript not given, using Whisper...")
+ global whisper_model
+ if whisper_model is None:
+ whisper_model = whisper.load_model("medium")
+ whisper_model.to(device)
+ torchaudio.save(f"./prompts/{name}.wav", wav, sr)
+ lang, text = transcribe_one(whisper_model, f"./prompts/{name}.wav")
+ lang_token = lang2token[lang]
+ text = lang_token + text + lang_token
+ os.remove(f"./prompts/{name}.wav")
+ whisper_model.cpu()
+ else:
+ text = transcript
+ lang, _ = langid.classify(text)
+ lang_token = lang2token[lang]
+ text = lang_token + text + lang_token
+
+ torch.cuda.empty_cache()
+ return text, lang
\ No newline at end of file
diff --git a/utils/sentence_cutter.py b/utils/sentence_cutter.py
new file mode 100644
index 0000000000000000000000000000000000000000..15ec197479fff9b667ae751d092e18a9108e7626
--- /dev/null
+++ b/utils/sentence_cutter.py
@@ -0,0 +1,54 @@
+import nltk
+import jieba
+import sudachipy
+import langid
+langid.set_languages(['en', 'zh', 'ja'])
+
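+# Per-language sentence splitting: NLTK for English, jieba plus a punctuation scan for Chinese,
+# and SudachiPy (sentence-final 句点 tokens) for Japanese.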
+def split_text_into_sentences(text):
+ if langid.classify(text)[0] == "en":
+ sentences = nltk.tokenize.sent_tokenize(text)
+
+ return sentences
+ elif langid.classify(text)[0] == "zh":
+ sentences = []
+ segs = jieba.cut(text, cut_all=False)
+ segs = list(segs)
+ start = 0
+ for i, seg in enumerate(segs):
+ if seg in ["。", "!", "?", "……"]:
+ sentences.append("".join(segs[start:i + 1]))
+ start = i + 1
+ if start < len(segs):
+ sentences.append("".join(segs[start:]))
+
+ return sentences
+ elif langid.classify(text)[0] == "ja":
+ sentences = []
+ tokenizer = sudachipy.Dictionary().create()
+ tokens = tokenizer.tokenize(text)
+ current_sentence = ""
+
+ for token in tokens:
+ current_sentence += token.surface()
+ if token.part_of_speech()[0] == "補助記号" and token.part_of_speech()[1] == "句点":
+ sentences.append(current_sentence)
+ current_sentence = ""
+
+ if current_sentence:
+ sentences.append(current_sentence)
+
+ return sentences
+
+ raise RuntimeError("It is impossible to reach here.")
+
+long_text = """
+This is a very long paragraph, so most TTS model is unable to handle it. Hence, we have to split it into several sentences. With the help of NLTK, we can split it into sentences. However, the punctuation is not preserved, so we have to add it back. How are we going to do write this code? Let's see.
+"""
+
+long_text = """
+现在我们要来尝试一下中文分句。因为很不幸的是,NLTK不支持中文分句。幸运的是,我们可以使用jieba来分句。但是,jieba分句后,标点符号会丢失,所以我们要手动添加回去。我现在正在想办法把这个例句写的更长更复杂一点,来测试jieba分句的性能。嗯......省略号,感觉不太好,因为省略号不是句号,所以jieba不会把它当作句子的结尾。会这样吗?我们来试试看。
+"""
+
+long_text = """
+これなら、英語と中国語の分句もできる。でも、日本語はどうする?まつわ、ChatGPTに僕と教えてください。ちょーと待ってください。あ、出来た!
+"""
\ No newline at end of file