# Hugging Face Space: TangRain/muskits-espnet-svs-demo (status: Running)
# File size: 11,725 Bytes

import os
import numpy as np
import gradio as gr
import pyopenjtalk
from util import preprocess_input, get_tokenizer, load_pitch_dict, get_pinyin

from espnet_model_zoo.downloader import ModelDownloader
from espnet2.fileio.read_text import read_label
from espnet2.bin.svs_inference import SingingGenerate


# Maps the singer label shown in the UI to the path of a saved NumPy
# speaker-embedding file. gen_song loads the selected file with np.load and
# passes it to the model as `spembs`.
# NOTE(review): the label "singer4 (male)" breaks the otherwise sequential
# numbering (there is no "singer5") - presumably meant to be "singer5 (male)".
# Renaming it requires changing the matching Radio choice in the same commit.
singer_embeddings = {
    "singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
    "singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
    "singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
    "singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
    "singer4 (male)": "resource/singer/singer_embedding_ace-7.npy",
    "singer6 (female)": "resource/singer/singer_embedding_itako.npy",
    "singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
    "singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
    "singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
    "singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
}

# Maps the UI language code to the integer language id that gen_song passes
# to the model as `lids`.
langs = {
    "zh": 2,
    "jp": 1,
}

# Built SingingGenerate instances keyed by (model tag, device). Downloading
# and constructing the model is by far the most expensive step, so it is done
# once per process instead of once per request.
_SVS_MODEL_CACHE = {}


def _svs_error(fs, message):
    """Return the (audio, status) pair used for every failed request."""
    return (fs, np.array([0.0])), message


def _is_blank(value):
    """Return True when a textbox value is missing or only whitespace."""
    return value is None or not str(value).strip()


def _load_svs(model_tag, device):
    """Return the SingingGenerate for ``model_tag``, building it on first use."""
    key = (model_tag, device)
    if key not in _SVS_MODEL_CACHE:
        downloaded = ModelDownloader().download_and_unpack(model_tag)
        _SVS_MODEL_CACHE[key] = SingingGenerate(
            train_config=downloaded["train_config"],
            model_file=downloaded["model_file"],
            device=device,
        )
    return _SVS_MODEL_CACHE[key]


def gen_song(lang, texts, durs, pitchs, spk):
    """Synthesize singing audio from lyrics, durations and pitches.

    Args:
        lang: Language code, "zh" or "jp".
        texts: Lyrics as one string of tokens; "AP", "SP" and the slur
            markers "-" / "——" are passed through without tokenization.
        durs: Whitespace-separated durations, one per lyric token.
        pitchs: Whitespace-separated pitches, one per lyric token; each must
            be a key of the dictionary returned by ``load_pitch_dict``.
        spk: Singer label, a key of ``singer_embeddings``.

    Returns:
        ``((fs, wav), status)``. On success ``wav`` is the generated waveform
        and ``status`` is "success!". On any input error ``wav`` is
        ``np.array([0.0])`` and ``status`` is a message starting with
        "Error:".
    """
    fs = 44100
    tempo = 120
    PRETRAIN_MODEL = "TangRain/mixdata_svs_visinger2_spkembed_lang_pretrained"

    # An untouched Gradio textbox yields "" rather than None, so treat blank
    # strings as missing too.
    if _is_blank(texts):
        return _svs_error(fs, "Error: No Text provided!")
    if _is_blank(durs):
        return _svs_error(fs, "Error: No Dur provided!")
    if _is_blank(pitchs):
        return _svs_error(fs, "Error: No Pitch provided!")

    # preprocess
    if lang == "zh":
        text_list = get_pinyin(preprocess_input(texts, ""))
    elif lang == "jp":
        text_list = preprocess_input(texts, " ").strip().split()
    else:
        return _svs_error(fs, f"Error: language `{lang}` is not supported!")
    dur_list = preprocess_input(durs, " ").strip().split()
    pitch_list = preprocess_input(pitchs, " ").strip().split()

    if not text_list:
        return _svs_error(fs, "Error: No Text provided!")
    if len(text_list) != len(dur_list):
        return _svs_error(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!",
        )
    if len(text_list) != len(pitch_list):
        return _svs_error(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!",
        )

    # text to phoneme
    tokenizer = get_tokenizer(lang)
    sybs = []
    for text in text_list:
        if text in ("AP", "SP", "-", "——"):
            rev = [text]
        else:
            rev = tokenizer(text)
            # Validate before iterating: a failed tokenization (False/empty)
            # must produce the error message, not a TypeError.
            if not rev:
                return _svs_error(fs, f"Error: text `{text}` is invalid!")
            rev = [f"{phn}@{lang}" for phn in rev]
        sybs.append("_".join(rev))

    pitch_dict = load_pitch_dict()

    labels = []
    notes = []
    st = 0
    pre_phn = ""
    for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
        if phns in ("-", "——"):
            # A slur repeats the last phoneme of the previous note, so it
            # cannot open the score.
            if not pre_phn:
                return _svs_error(
                    fs, f"Error: slur `{phns}` has no preceding lyric to extend!"
                )
            phns = pre_phn
        if pitch not in pitch_dict:
            return _svs_error(fs, f"Error: pitch `{pitch}` is invalid!")
        pitch = pitch_dict[pitch]
        try:
            dur_value = float(dur)
        except ValueError:
            return _svs_error(fs, f"Error: duration `{dur}` is invalid!")
        # Rejects zero, negative, NaN and infinite durations in one test.
        if not 0 < dur_value < float("inf"):
            return _svs_error(fs, f"Error: duration `{dur}` is invalid!")
        phn_list = phns.split("_")
        lyric = "".join(phn_list)
        notes.append([st, st + dur_value, lyric, pitch, phns])
        st += dur_value
        labels.extend(phn_list)
        pre_phn = labels[-1]

    batch = {
        "score": (
            int(tempo),
            notes,
        ),
        "text": " ".join(labels),
    }

    if spk not in singer_embeddings:
        return _svs_error(fs, f"Error: singer `{spk}` is invalid!")

    # Infer (CPU only).
    device = "cpu"
    svs = _load_svs(PRETRAIN_MODEL, device)
    lid = langs[lang]
    spk_embed = np.load(singer_embeddings[spk])
    output_dict = svs(batch, lids=np.array([lid]), spembs=spk_embed)
    wav_info = output_dict["wav"].cpu().numpy()
    return (fs, wav_info), "success!"


# Page heading passed to gr.Interface(title=...).
title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"

# HTML usage guide passed to gr.Interface(description=...).
# The literal is not a raw string: `\n` is a real newline, while `\\n` ends up
# as the two characters backslash + n in the rendered page.
# NOTE(review): the "Notice" item refers to "this range", but no range is
# stated anywhere in the text - a preceding sentence appears to be missing.
description = """
<div style="font-size: 20px; ">
  <p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
  <p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.\n
  Music score contains information about lyrics, as well as duration and pitch of each word in lyrics.</p>

  <h1>How to use:</h1>
  <ol>
    <li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
    <li> <b>Input lyrics</b>:
        <ul>
            <li> Lyrics use Chinese characters when the language is 'zh' and hiragana when the language is 'jp'. </li>
            <li> Special characters such as 'AP' (breath), 'SP' (silence), and '-' (slur, only for 'zh') can also be used. </li>
            <li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
        </ul>
    </li>
    <li> <b>Input durations</b>: 
        <ul>
            <li> Length of duration sequence should <b>be same as lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
            <li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
        </ul>
    </li>
    <li> <b>Input pitches</b>:
        <ul>
            <li> Length of pitch sequence should <b>be same as lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
            <li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
        </ul>
    </li>
    <li> <b>Choose one singer</b> </li>
    <li> <b>Click submit button</b> </li>
  </ol>

  <h1>Notice:</h1>
  <ul>
    <li> Values outside this range may result in suboptimal generation quality! </li>
  </ul>
</div>
"""

# HTML footer passed to gr.Interface(article=...): reference links plus the
# BibTeX entry users are asked to cite.
article = """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
<a href="https://github.com/espnet/espnet">espnet GitHub</a> |
<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">pretrained model</a></p>

<pre>
@inproceedings{wu2024muskits,
  title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
  author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
  booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
  year={2024},
}
</pre>

</div>
"""


# Pre-filled examples for the UI. Each row follows the order of the Interface
# inputs: [language, lyrics, durations, pitches, singer].
# SP: silence, AP: aspirate (breath); "-" marks a slur.
# Pitches appear either as numbers ("60", "0" - presumably MIDI note numbers,
# confirm against load_pitch_dict) or as note names ("C4", "rest").
examples = [
    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest", "singer1 (male)"],
    ["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
    ["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
    ["zh", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0", "singer2 (female)"],
    ["zh", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0", "singer2 (female)"],
    ["zh", "SP 你 看 着 车 窗 - SP", " 0.41 0.96 0.7 0.64 1.12 1.14 1.04 0.29", "0 60 60 62 60 64 65 0", "singer3 (male)"],
    ["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
    ["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58", "singer8 (female)"],
    ["jp", "い じ ん さ ん に つ れ ら れ て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
    ["jp", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
    ["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
    ["jp", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60", "singer10 (female)"],
    ["jp", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59", "singer9 (male)"],
]

# Gradio UI wiring. The five inputs are passed positionally to gen_song as
# (lang, texts, durs, pitchs, spk); the two outputs receive its
# ((fs, wav), status) return value.
# The radio choices are derived from the `langs` and `singer_embeddings`
# dictionaries (insertion order is preserved) so the UI can never offer a
# label that gen_song cannot look up.
app = gr.Interface(
    fn=gen_song,
    inputs=[
        gr.Radio(label="language", choices=list(langs), value="zh"),
        gr.Textbox(label="Lyrics"),
        gr.Textbox(label="Duration"),
        gr.Textbox(label="Pitch"),
        gr.Radio(
            label="Singer",
            choices=list(singer_embeddings),
            value="singer1 (male)",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Song", type="numpy"),
        gr.Textbox(label="Running Status"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)

app.launch()