import functools

import numpy as np
import gradio as gr
from pypinyin import lazy_pinyin
from pinyin_dict import PINYIN_DICT
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.fileio.read_text import read_label
from espnet2.bin.svs_inference import SingingGenerate

# Display label shown in the UI -> integer singer id passed to the model.
spks = {
    "singer1 (man)": 1,
    "singer2 (man)": 2,
    "singer3 (female)": 5,
    "singer4 (female)": 9,
    "singer5 (man)": 18,
    "singer6 (female)": 15,
    "singer7 (man)": 23,
    "singer8 (man)": 25,
    "singer9 (female)": 29,
    "singer10 (man)": 27,
}

# Sampling rate returned with every audio result (including error results).
_SAMPLE_RATE = 44100

# Language id -> pretrained model tag handed to ModelDownloader.
_PRETRAIN_MODELS = {
    "zh": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
}

_PITCH_TABLE_PATH = "./midi-note.scp"


def _failure(fs, message):
    """Return the (audio, status) pair used to reject a request.

    The audio part is a one-sample silent clip so the audio output always
    receives a well-formed ``(sample_rate, ndarray)`` value.
    """
    return (fs, np.array([0.0])), message


@functools.lru_cache(maxsize=None)
def _load_pitch_table(path=_PITCH_TABLE_PATH):
    """Read the pitch lookup table from ``path`` once and cache it.

    Every line with at least two whitespace-separated fields contributes two
    entries: the first field and the second field both map to ``int`` of the
    second field, so a pitch may be given either way. Shorter lines (for
    example blank ones) are skipped.

    The returned dict is shared between calls and must not be mutated.
    """
    table = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            items = line.split()
            if len(items) < 2:
                continue
            value = int(items[1])
            table[items[0]] = value
            table[items[1]] = value
    return table


@functools.lru_cache(maxsize=None)
def _load_svs(model_tag, device):
    """Download (if needed) and build the synthesizer for ``model_tag``.

    Cached so that the model is constructed once per (tag, device) pair
    instead of on every request.
    """
    downloaded = ModelDownloader().download_and_unpack(model_tag)
    return SingingGenerate(
        train_config=downloaded["train_config"],
        model_file=downloaded["model_file"],
        device=device,
    )


def gen_song(lang, tempo, texts, durs, pitchs, spk):
    """Synthesize a singing clip from lyrics, durations and pitches.

    Args:
        lang: Language id; only ids listed in ``_PRETRAIN_MODELS`` ("zh") are
            accepted.
        tempo: Tempo; anything ``int()`` accepts (e.g. ``89`` or ``"89"``).
        texts: Lyrics. They are split with ``lazy_pinyin`` and every resulting
            token, lowercased, must be a key of ``PINYIN_DICT``.
        durs: Whitespace-separated numbers, one per lyric token
            (presumably seconds -- confirm against the model's score format).
        pitchs: Whitespace-separated pitch tokens, one per lyric token; each
            must be an entry of the pitch table file.
        spk: Singer label; must be a key of ``spks``.

    Returns:
        ``((fs, waveform), status)``. On success ``status`` is ``"success!"``.
        When an input is rejected, ``waveform`` is a single zero sample and
        ``status`` is a message starting with ``"Error:"``.
    """
    fs = _SAMPLE_RATE
    if lang not in _PRETRAIN_MODELS:
        return _failure(fs, f"Error: language `{lang}` is not supported!")

    # Validate presence before any processing; an empty Textbox arrives as an
    # empty string, so test for falsiness rather than only for None.
    if not texts:
        return _failure(fs, "Error: No Text provided!")
    if not durs:
        return _failure(fs, "Error: No Dur provided!")
    if not pitchs:
        return _failure(fs, "Error: No Pitch provided!")

    text_list = lazy_pinyin(texts)
    dur_list = durs.strip().split()
    pitch_list = pitchs.strip().split()

    if len(text_list) != len(dur_list):
        return _failure(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!",
        )
    if len(text_list) != len(pitch_list):
        return _failure(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!",
        )

    # Text to phonemes. Only a pinyin front end exists, which is what the
    # single supported language ("zh") uses.
    sybs = []
    for text in text_list:
        text = text.lower()
        if text not in PINYIN_DICT:
            return _failure(fs, f"Error: pinyin `{text}` is invalid!")
        sybs.append("_".join(PINYIN_DICT[text]))

    # Build the score: one [start, end, lyric, pitch, phonemes] entry per
    # syllable, with start/end accumulated from the durations.
    pitch_dict = _load_pitch_table()
    labels = []
    notes = []
    st = 0
    for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
        if pitch not in pitch_dict:
            return _failure(fs, f"Error: pitch `{pitch}` is invalid!")
        try:
            dur = float(dur)
        except ValueError:
            return _failure(fs, f"Error: duration `{dur}` is invalid!")
        phn_list = phns.split("_")
        notes.append([st, st + dur, "".join(phn_list), pitch_dict[pitch], phns])
        st += dur
        labels.extend(phn_list)

    try:
        tempo_value = int(tempo)
    except (TypeError, ValueError):
        return _failure(fs, f"Error: tempo `{tempo}` is invalid!")

    if spk not in spks:
        return _failure(fs, f"Error: singer `{spk}` is invalid!")

    batch = {
        "score": (tempo_value, notes),
        "text": " ".join(labels),
    }

    # Inference is pinned to CPU; change the device string here to run
    # elsewhere (e.g. "cuda" when a GPU is available).
    device = "cpu"
    svs = _load_svs(_PRETRAIN_MODELS[lang], device)
    output_dict = svs(batch, sids=np.array([spks[spk]]))
    wav_info = output_dict["wav"].cpu().numpy()
    return (fs, wav_info), "success!"


title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"

description = """ This is the demo page of our toolkit Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm.

How to use:

  1. Choose language ID. Language id
  2. Input tempo in integer
  3. Input text, duration, pitch of equal length
  4. Choose one singer
  5. Click submit button
"""

article = """

References: Muskits-ESPnet paper | espnet GitHub | pretrained model

@inproceedings{wu2024muskits,
  title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
  author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
  booktitle={Proc. ACM Multimedia},
  year={2024},
}
"""

# SP: silence, AP: aspirate.
examples = [
    ["zh", 89, "雨淋湿了SP天空AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "60 62 62 62 0 62 58 0", "singer1 (man)"],
    ["zh", 89, "雨淋湿了SP天空AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (man)"],
    ["zh", 89, "雨淋湿了SP天空AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"],
]

demo = gr.Interface(
    fn=gen_song,
    inputs=[
        gr.Radio(label="language", choices=["zh"], value="zh"),
        gr.Textbox(label="Tempo"),
        gr.Textbox(label="Text"),
        gr.Textbox(label="Duration"),
        gr.Textbox(label="Pitch"),
        # Singer choices come straight from `spks` (insertion order) so the
        # UI cannot drift from the ids gen_song can resolve.
        gr.Radio(label="Singer", choices=list(spks), value="singer1 (man)"),
    ],
    outputs=[
        gr.Audio(label="Generated Song", type="numpy"),
        gr.Textbox(label="Running Status"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()