import os import numpy as np import gradio as gr import pyopenjtalk from util import preprocess_input, get_tokenizer, load_pitch_dict, get_pinyin from espnet_model_zoo.downloader import ModelDownloader from espnet2.fileio.read_text import read_label from espnet2.bin.svs_inference import SingingGenerate singer_embeddings = { "singer1 (male)": "resource/singer/singer_embedding_ace-1.npy", "singer2 (female)": "resource/singer/singer_embedding_ace-2.npy", "singer3 (male)": "resource/singer/singer_embedding_ace-3.npy", "singer4 (female)": "resource/singer/singer_embedding_ace-8.npy", "singer4 (male)": "resource/singer/singer_embedding_ace-7.npy", "singer6 (female)": "resource/singer/singer_embedding_itako.npy", "singer7 (male)": "resource/singer/singer_embedding_ofuton.npy", "singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy", "singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy", "singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy", } langs = { "zh": 2, "jp": 1, } def gen_song(lang, texts, durs, pitchs, spk): fs = 44100 tempo = 120 PRETRAIN_MODEL = "TangRain/mixdata_svs_visinger2_spkembed_lang_pretrained" # pretrain_downloaded = { # "train_config": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/config.yaml", # "model_file": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/500epoch.pth", # } if texts is None: return (fs, np.array([0.0])), "Error: No Text provided!" if durs is None: return (fs, np.array([0.0])), "Error: No Dur provided!" if pitchs is None: return (fs, np.array([0.0])), "Error: No Pitch provided!" 
# preprocess if lang == "zh": texts = preprocess_input(texts, "") text_list = get_pinyin(texts) elif lang == "jp": texts = preprocess_input(texts, " ") text_list = texts.strip().split() durs = preprocess_input(durs, " ") dur_list = durs.strip().split() pitchs = preprocess_input(pitchs, " ") pitch_list = pitchs.strip().split() if len(text_list) != len(dur_list): return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!" if len(text_list) != len(pitch_list): return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!" ## text to phoneme tokenizer = get_tokenizer(lang) sybs = [] for text in text_list: if text == "AP" or text == "SP": rev = [text] elif text == "-" or text == "——": rev = [text] else: rev = tokenizer(text) rev = [phn + f"@{lang}" for phn in rev] if rev == False: return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!" phns = "_".join(rev) sybs.append(phns) pitch_dict = load_pitch_dict() labels = [] notes = [] st = 0 pre_phn = "" for phns, dur, pitch in zip(sybs, dur_list, pitch_list): if phns == "-" or phns == "——": phns = pre_phn if pitch not in pitch_dict: return (fs, np.array([0.0])), f"Error: pitch `{pitch}` is invalid!" pitch = pitch_dict[pitch] phn_list = phns.split("_") lyric = "".join(phn_list) dur = float(dur) note = [st, st + dur, lyric, pitch, phns] st += dur notes.append(note) for phn in phn_list: labels.append(phn) pre_phn = labels[-1] phns_str = " ".join(labels) batch = { "score": ( int(tempo), notes, ), "text": phns_str, } # print(batch) # return (fs, np.array([0.0])), "success!" 
# Infer device = "cpu" # device = "cuda" if torch.cuda.is_available() else "cpu" d = ModelDownloader() pretrain_downloaded = d.download_and_unpack(PRETRAIN_MODEL) svs = SingingGenerate( train_config = pretrain_downloaded["train_config"], model_file = pretrain_downloaded["model_file"], device = device ) # sid = spks[spk] lid = langs[lang] spk_embed = np.load(singer_embeddings[spk]) # output_dict = svs(batch, sids=np.array([sid])) output_dict = svs(batch, lids=np.array([lid]), spembs=spk_embed) wav_info = output_dict["wav"].cpu().numpy() return (fs, wav_info), "success!" title = "Demo of Singing Voice Synthesis in Muskits-ESPnet" description = """
This is the demo page of our toolkit Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm.
Singing Voice Synthesis (SVS) takes a music score as input and generates a singing vocal in the voice of a specific singer.\n The music score contains the lyrics, as well as the duration and pitch of each word in the lyrics.
References: Muskits-ESPnet paper | espnet GitHub | pretrained model
@inproceedings{wu2024muskits, title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm}, author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin}, booktitle={Proceedings of the 32nd ACM International Conference on Multimedia}, year={2024}, }