# File size: 5,745 Bytes
# 9df835b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import numpy as np
import gradio as gr
from pypinyin import lazy_pinyin

from pinyin_dict import PINYIN_DICT

from espnet_model_zoo.downloader import ModelDownloader
from espnet2.fileio.read_text import read_label
from espnet2.bin.svs_inference import SingingGenerate


# Singer label offered in the UI -> speaker id passed to the model as `sids`.
# Labels are generated as "singer<N> (<gender>)" with N counting from 1.
spks = {
    f"singer{rank} ({gender})": speaker_id
    for rank, (gender, speaker_id) in enumerate(
        [
            ("man", 1),
            ("man", 2),
            ("female", 5),
            ("female", 9),
            ("man", 18),
            ("female", 15),
            ("man", 23),
            ("man", 25),
            ("female", 29),
            ("man", 27),
        ],
        start=1,
    )
}

# Pretrained model tag and output sample rate for each supported language.
_LANG_CONFIG = {
    "zh": ("espnet/aceopencpop_svs_visinger2_40singer_pretrain", 44100),
}
# Sample rate reported with the placeholder audio when the language is unknown.
_FALLBACK_FS = 44100
# Synthesizers already loaded, keyed by (model tag, device), so the model is
# downloaded and instantiated once instead of on every request.
_SVS_CACHE = {}


def _fail(fs, message):
    """Return the (audio, status) pair used for every validation failure."""
    return (fs, np.array([0.0])), message


def _is_blank(value):
    """Return True when a text field is None, empty, or only whitespace."""
    return value is None or not str(value).strip()


def _load_pitch_table(path="./midi-note.scp"):
    """Read the pitch table: both columns of a line map to its integer second column."""
    table = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            items = line.split()
            if len(items) < 2:
                # Blank or truncated line: nothing to register.
                continue
            value = int(items[1])
            table[items[0]] = value
            table[items[1]] = value
    return table


def _get_svs(model_tag, device):
    """Return the cached SingingGenerate for `model_tag`, loading it on first use."""
    key = (model_tag, device)
    if key not in _SVS_CACHE:
        downloaded = ModelDownloader().download_and_unpack(model_tag)
        _SVS_CACHE[key] = SingingGenerate(
            train_config=downloaded["train_config"],
            model_file=downloaded["model_file"],
            device=device,
        )
    return _SVS_CACHE[key]


def gen_song(lang, tempo, texts, durs, pitchs, spk):
    """Synthesize a singing waveform from lyrics, durations and pitches.

    All user-input problems are reported through the returned status string
    rather than raised, so the UI always receives a valid output pair.

    Args:
        lang: Language code; must be a key of `_LANG_CONFIG` (currently "zh").
        tempo: Tempo, anything `int()` accepts; must be positive.
        texts: Lyrics. Converted to syllables with `lazy_pinyin`, then each
            lower-cased syllable is looked up in `PINYIN_DICT`.
        durs: Whitespace-separated note durations, one per syllable
            (presumably seconds -- confirm against the model's score format).
        pitchs: Whitespace-separated pitch tokens, one per syllable; each
            must appear in `./midi-note.scp` (either column).
        spk: Singer label; must be a key of `spks`.

    Returns:
        `((fs, wav), status)`. On success `wav` is the model output as a
        numpy array and `status` is "success!". On failure `wav` is
        `np.array([0.0])` and `status` starts with "Error:".
    """
    if lang not in _LANG_CONFIG:
        return _fail(_FALLBACK_FS, f"Error: language `{lang}` is not supported!")
    model_tag, fs = _LANG_CONFIG[lang]

    # Validate the raw fields before any conversion touches them; empty
    # strings are treated like missing values.
    if _is_blank(texts):
        return _fail(fs, "Error: No Text provided!")
    if _is_blank(durs):
        return _fail(fs, "Error: No Dur provided!")
    if _is_blank(pitchs):
        return _fail(fs, "Error: No Pitch provided!")

    try:
        tempo_value = int(tempo)
    except (TypeError, ValueError):
        return _fail(fs, f"Error: tempo `{tempo}` is invalid!")
    if tempo_value <= 0:
        return _fail(fs, f"Error: tempo `{tempo}` is invalid!")

    if spk not in spks:
        return _fail(fs, f"Error: singer `{spk}` is invalid!")

    text_list = lazy_pinyin(texts)
    dur_list = durs.strip().split()
    pitch_list = pitchs.strip().split()

    if len(text_list) != len(dur_list):
        return _fail(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!",
        )
    if len(text_list) != len(pitch_list):
        return _fail(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!",
        )

    # Text to phonemes: each syllable becomes its phonemes joined by "_".
    # "zh" is the only configured language, so the pinyin table always applies.
    sybs = []
    for text in text_list:
        text = text.lower()
        if text not in PINYIN_DICT:
            return _fail(fs, f"Error: pinyin `{text}` is invalid!")
        sybs.append("_".join(PINYIN_DICT[text]))

    pitch_dict = _load_pitch_table()

    # Build the score: one [start, end, lyric, pitch, phonemes] entry per
    # syllable, laid end to end, plus the flat phoneme sequence.
    labels = []
    notes = []
    st = 0
    for phns, dur_text, pitch in zip(sybs, dur_list, pitch_list):
        if pitch not in pitch_dict:
            return _fail(fs, f"Error: pitch `{pitch}` is invalid!")
        try:
            dur = float(dur_text)
        except ValueError:
            return _fail(fs, f"Error: duration `{dur_text}` is invalid!")
        # Rejects zero, negative, NaN and infinite durations in one test.
        if not 0 < dur < float("inf"):
            return _fail(fs, f"Error: duration `{dur_text}` is invalid!")
        phn_list = phns.split("_")
        notes.append([st, st + dur, "".join(phn_list), pitch_dict[pitch], phns])
        st += dur
        labels.extend(phn_list)

    batch = {
        "score": (tempo_value, notes),
        "text": " ".join(labels),
    }

    # Infer
    device = "cpu"
    svs = _get_svs(model_tag, device)
    output_dict = svs(batch, sids=np.array([spks[spk]]))
    wav_info = output_dict["wav"].cpu().numpy()
    return (fs, wav_info), "success!"


# Heading passed to gr.Interface as `title`.
title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"

# HTML usage instructions passed to gr.Interface as `description`.
description = """
This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.


<p>How to use:</p>
<ol>
  <li> Choose language ID. Only "zh" is currently available. </li>
  <li> Input tempo in integer </li>
  <li> Input text, duration, pitch of equal length (durations and pitches are separated by spaces) </li>
  <li> Choose one singer </li>
  <li> Click submit button </li>
</ol>


"""

# HTML block passed to gr.Interface as `article`: reference links (paper,
# ESPnet repository, pretrained model) followed by a BibTeX citation entry.
article = """
<div style='margin:20px auto;'>

<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
<a href="https://github.com/espnet/espnet">espnet GitHub</a> |
<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">pretrained model</a></p>

<pre>
@inproceedings{wu2024muskits,
  title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
  author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
  booktitle={Proc. ACM Multimedia},
  year={2024},
}
</pre>

</div>
"""


# Preset rows for the demo, in input order:
# [language, tempo, text, durations, pitches, singer].
# In the lyric, SP marks silence and AP marks an aspirate (breath).
# The three rows share lyric and timing and differ only in how the pitches
# are written and in which singer is selected.
examples = [
    [
        "zh",
        89,
        "雨淋湿了SP天空AP",
        "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21",
        pitch_seq,
        singer,
    ]
    for pitch_seq, singer in (
        ("60 62 62 62 0 62 58 0", "singer1 (man)"),
        ("C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (man)"),
        ("C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"),
    )
]

# Build and start the demo UI. Inputs are listed in the positional order of
# gen_song(lang, tempo, texts, durs, pitchs, spk); outputs match its
# ((fs, wav), status) return value.
demo = gr.Interface(
    fn=gen_song,
    inputs=[
        gr.Radio(label="language", choices=["zh"], value="zh"),
        gr.Textbox(label="Tempo"),
        gr.Textbox(label="Text"),
        gr.Textbox(label="Duration"),
        gr.Textbox(label="Pitch"),
        gr.Radio(
            label="Singer",
            # Taken from `spks` so every selectable singer is guaranteed to
            # have a speaker id; a hand-copied list could drift out of sync.
            choices=list(spks),
            value="singer1 (man)",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Song", type="numpy"),
        gr.Textbox(label="Running Status"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()