|
import functools

import gradio as gr
import numpy as np
from espnet2.bin.svs_inference import SingingGenerate
from espnet2.fileio.read_text import read_label
from espnet_model_zoo.downloader import ModelDownloader
from pypinyin import lazy_pinyin

from pinyin_dict import PINYIN_DICT
|
|
|
|
|
# Singer label shown in the UI -> speaker id handed to the model as `sids`.
spks = dict(
    [
        ("singer1 (man)", 1),
        ("singer2 (man)", 2),
        ("singer3 (female)", 5),
        ("singer4 (female)", 9),
        ("singer5 (man)", 18),
        ("singer6 (female)", 15),
        ("singer7 (man)", 23),
        ("singer8 (man)", 25),
        ("singer9 (female)", 29),
        ("singer10 (man)", 27),
    ]
)
|
|
|
# Per-language settings: (pretrained model tag, output sample rate in Hz).
_LANG_CONFIG = {
    "zh": ("espnet/aceopencpop_svs_visinger2_40singer_pretrain", 44100),
}

# Sample rate reported alongside the placeholder audio when the language is
# unknown and no per-language rate is available.
_DEFAULT_FS = 44100

_PITCH_TABLE_PATH = "./midi-note.scp"


def _error(fs, message):
    """Return the (audio, status) pair used to report a failure to the UI."""
    return (fs, np.array([0.0])), message


def _is_blank(value):
    """Return True when a text field is missing or contains only whitespace."""
    return value is None or not value.strip()


def _load_pitch_table(path=_PITCH_TABLE_PATH):
    """Read the pitch table file into a token -> int lookup.

    Each usable line holds at least two whitespace-separated columns. Both the
    first column (presumably a note name such as ``C4`` -- confirm against the
    file) and the second column itself are mapped to ``int(second column)``,
    so either spelling can be used as a pitch token.
    """
    table = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            items = line.strip().split()
            if len(items) < 2:
                # Blank or truncated line: nothing to register.
                continue
            value = int(items[1])
            table[items[0]] = value
            table[items[1]] = value
    return table


@functools.lru_cache(maxsize=1)
def _load_svs(model_tag, device):
    """Download (if needed) and build the synthesizer once per model/device."""
    downloaded = ModelDownloader().download_and_unpack(model_tag)
    return SingingGenerate(
        train_config=downloaded["train_config"],
        model_file=downloaded["model_file"],
        device=device,
    )


def gen_song(lang, tempo, texts, durs, pitchs, spk):
    """Synthesize a singing voice clip from lyrics, durations and pitches.

    Args:
        lang: Language code; only ``"zh"`` is configured.
        tempo: Tempo of the score; converted with ``int()``.
        texts: Lyrics string, split into syllables with ``lazy_pinyin``.
        durs: Whitespace-separated note durations, one per syllable
            (presumably in seconds -- confirm against the model).
        pitchs: Whitespace-separated pitch tokens, one per syllable; each must
            appear in the pitch table file.
        spk: Singer label; must be a key of ``spks``.

    Returns:
        ``((sample_rate, waveform), status)``. On invalid input the waveform
        is a single zero sample and ``status`` is an ``"Error: ..."`` message;
        on success ``status`` is ``"success!"``.
    """
    if lang not in _LANG_CONFIG:
        return _error(_DEFAULT_FS, f"Error: language `{lang}` is not supported!")
    model_tag, fs = _LANG_CONFIG[lang]

    # Validate presence before touching the inputs: an empty textbox arrives
    # as "" rather than None, and lazy_pinyin must not see a missing value.
    if _is_blank(texts):
        return _error(fs, "Error: No Text provided!")
    if _is_blank(durs):
        return _error(fs, "Error: No Dur provided!")
    if _is_blank(pitchs):
        return _error(fs, "Error: No Pitch provided!")

    text_list = lazy_pinyin(texts)
    dur_list = durs.strip().split()
    pitch_list = pitchs.strip().split()

    if len(text_list) != len(dur_list):
        return _error(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!",
        )
    if len(text_list) != len(pitch_list):
        return _error(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!",
        )

    # Map every syllable to its phoneme sequence, joined with "_".
    sybs = []
    for text in text_list:
        text = text.lower()
        if text not in PINYIN_DICT:
            return _error(fs, f"Error: pinyin `{text}` is invalid!")
        sybs.append("_".join(PINYIN_DICT[text]))

    pitch_dict = _load_pitch_table()

    labels = []
    notes = []
    st = 0  # running start time of the next note
    for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
        if pitch not in pitch_dict:
            return _error(fs, f"Error: pitch `{pitch}` is invalid!")
        try:
            dur_val = float(dur)
        except ValueError:
            return _error(fs, f"Error: duration `{dur}` is invalid!")
        if not np.isfinite(dur_val) or dur_val <= 0:
            return _error(fs, f"Error: duration `{dur}` is invalid!")
        phn_list = phns.split("_")
        # Note layout: [start, end, lyric, pitch value, "_"-joined phonemes].
        notes.append([st, st + dur_val, "".join(phn_list), pitch_dict[pitch], phns])
        st += dur_val
        labels.extend(phn_list)

    try:
        tempo_val = int(tempo)
    except (TypeError, ValueError):
        return _error(fs, f"Error: tempo `{tempo}` is invalid!")
    if tempo_val <= 0:
        return _error(fs, f"Error: tempo `{tempo}` is invalid!")

    if spk not in spks:
        return _error(fs, f"Error: singer `{spk}` is invalid!")

    batch = {
        "score": (tempo_val, notes),
        "text": " ".join(labels),
    }

    # The synthesizer is cached so repeated requests do not reload the model.
    svs = _load_svs(model_tag, "cpu")
    output_dict = svs(batch, sids=np.array([spks[spk]]))
    wav_info = output_dict["wav"].cpu().numpy()
    return (fs, wav_info), "success!"
|
|
|
|
|
# Page heading shown at the top of the demo.
title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"

# Introductory HTML rendered under the title: what the demo is and how to use it.
description = """
This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.


<p>How to use:</p>
<ol>
<li> Choose language ID (currently only "zh") </li>
<li> Input tempo in integer </li>
<li> Input text, duration, pitch of equal length </li>
<li> Choose one singer </li>
<li> Click submit button </li>
</ol>


"""

# Footer HTML: reference links and the BibTeX entry for the paper.
article = """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
<a href="https://github.com/espnet/espnet">espnet GitHub</a> |
<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">pretrained model</a></p>

<pre>
@inproceedings{wu2024muskits,
title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
booktitle={Proc. ACM Multimedia},
year={2024},
}
</pre>

</div>
"""
|
|
|
|
|
|
|
# Preset rows for the UI, in input order: language, tempo, text, durations,
# pitches, singer. All rows share the same lyric and durations and differ only
# in how the pitches are spelled and which singer is selected.
_EXAMPLE_TEXT = "雨淋湿了SP天空AP"
_EXAMPLE_DURS = "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21"

examples = [
    ["zh", 89, _EXAMPLE_TEXT, _EXAMPLE_DURS, example_pitch, example_singer]
    for example_pitch, example_singer in (
        ("60 62 62 62 0 62 58 0", "singer1 (man)"),
        ("C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (man)"),
        ("C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"),
    )
]
|
|
|
# Build the web UI. Inputs are listed in the same order as gen_song's
# parameters: lang, tempo, texts, durs, pitchs, spk.
demo = gr.Interface(
    fn=gen_song,
    inputs=[
        gr.Radio(label="language", choices=["zh"], value="zh"),
        gr.Textbox(label="Tempo"),
        gr.Textbox(label="Text"),
        gr.Textbox(label="Duration"),
        gr.Textbox(label="Pitch"),
        # Singer labels come from `spks`, so the choices offered here can
        # never drift from the labels gen_song is able to resolve.
        gr.Radio(
            label="Singer",
            choices=list(spks),
            value="singer1 (man)",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Song", type="numpy"),
        gr.Textbox(label="Running Status"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)

# Start the server only when run as a script, so importing this module
# (e.g. for testing) does not launch it as a side effect.
if __name__ == "__main__":
    demo.launch()
|
|