v1: single model
Browse files
app.py
CHANGED
@@ -2,8 +2,7 @@ import os
|
|
2 |
import numpy as np
|
3 |
import gradio as gr
|
4 |
import pyopenjtalk
|
5 |
-
from
|
6 |
-
from util import preprocess_input, get_tokenizer, load_pitch_dict
|
7 |
|
8 |
from espnet_model_zoo.downloader import ModelDownloader
|
9 |
from espnet2.fileio.read_text import read_label
|
@@ -46,7 +45,7 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
46 |
# preprocess
|
47 |
if lang == "zh":
|
48 |
texts = preprocess_input(texts, "")
|
49 |
-
text_list =
|
50 |
elif lang == "jp":
|
51 |
texts = preprocess_input(texts, " ")
|
52 |
text_list = texts.strip().split()
|
@@ -66,6 +65,8 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
66 |
for text in text_list:
|
67 |
if text == "AP" or text == "SP":
|
68 |
rev = [text]
|
|
|
|
|
69 |
else:
|
70 |
rev = tokenizer(text)
|
71 |
rev = [phn + f"@{lang}" for phn in rev]
|
@@ -79,18 +80,22 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
79 |
labels = []
|
80 |
notes = []
|
81 |
st = 0
|
|
|
82 |
for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
|
|
|
|
|
83 |
if pitch not in pitch_dict:
|
84 |
return (fs, np.array([0.0])), f"Error: pitch `{pitch}` is invalid!"
|
85 |
pitch = pitch_dict[pitch]
|
86 |
-
dur = float(dur)
|
87 |
phn_list = phns.split("_")
|
88 |
lyric = "".join(phn_list)
|
|
|
89 |
note = [st, st + dur, lyric, pitch, phns]
|
90 |
st += dur
|
91 |
notes.append(note)
|
92 |
for phn in phn_list:
|
93 |
labels.append(phn)
|
|
|
94 |
|
95 |
phns_str = " ".join(labels)
|
96 |
batch = {
|
@@ -125,16 +130,18 @@ def gen_song(lang, texts, durs, pitchs, spk):
|
|
125 |
title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"
|
126 |
|
127 |
description = """
|
128 |
-
<div style="font-size: 20px;"
|
129 |
-
|
130 |
<p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
|
131 |
Music score contains information about lyrics, as well as duration and pitch of each word in lyrics.</p>
|
132 |
|
133 |
-
<
|
134 |
<ol>
|
135 |
<li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
|
136 |
<li> <b>Input lyrics</b>:
|
137 |
<ul>
|
|
|
|
|
138 |
<li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
139 |
</ul>
|
140 |
</li>
|
@@ -154,7 +161,10 @@ description = """
|
|
154 |
<li> <b>Click submit button</b> </li>
|
155 |
</ol>
|
156 |
|
157 |
-
<
|
|
|
|
|
|
|
158 |
</div>
|
159 |
"""
|
160 |
|
@@ -182,9 +192,20 @@ article = """
|
|
182 |
examples = [
|
183 |
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
|
184 |
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
|
185 |
-
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
|
|
|
|
|
188 |
]
|
189 |
|
190 |
app = gr.Interface(
|
|
|
2 |
import numpy as np
|
3 |
import gradio as gr
|
4 |
import pyopenjtalk
|
5 |
+
from util import preprocess_input, get_tokenizer, load_pitch_dict, get_pinyin
|
|
|
6 |
|
7 |
from espnet_model_zoo.downloader import ModelDownloader
|
8 |
from espnet2.fileio.read_text import read_label
|
|
|
45 |
# preprocess
|
46 |
if lang == "zh":
|
47 |
texts = preprocess_input(texts, "")
|
48 |
+
text_list = get_pinyin(texts)
|
49 |
elif lang == "jp":
|
50 |
texts = preprocess_input(texts, " ")
|
51 |
text_list = texts.strip().split()
|
|
|
65 |
for text in text_list:
|
66 |
if text == "AP" or text == "SP":
|
67 |
rev = [text]
|
68 |
+
elif text == "-" or text == "——":
|
69 |
+
rev = [text]
|
70 |
else:
|
71 |
rev = tokenizer(text)
|
72 |
rev = [phn + f"@{lang}" for phn in rev]
|
|
|
80 |
labels = []
|
81 |
notes = []
|
82 |
st = 0
|
83 |
+
pre_phn = ""
|
84 |
for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
|
85 |
+
if phns == "-" or phns == "——":
|
86 |
+
phns = pre_phn
|
87 |
if pitch not in pitch_dict:
|
88 |
return (fs, np.array([0.0])), f"Error: pitch `{pitch}` is invalid!"
|
89 |
pitch = pitch_dict[pitch]
|
|
|
90 |
phn_list = phns.split("_")
|
91 |
lyric = "".join(phn_list)
|
92 |
+
dur = float(dur)
|
93 |
note = [st, st + dur, lyric, pitch, phns]
|
94 |
st += dur
|
95 |
notes.append(note)
|
96 |
for phn in phn_list:
|
97 |
labels.append(phn)
|
98 |
+
pre_phn = labels[-1]
|
99 |
|
100 |
phns_str = " ".join(labels)
|
101 |
batch = {
|
|
|
130 |
title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"
|
131 |
|
132 |
description = """
|
133 |
+
<div style="font-size: 20px;"
|
134 |
+
This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.
|
135 |
<p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
|
136 |
Music score contains information about lyrics, as well as duration and pitch of each word in lyrics.</p>
|
137 |
|
138 |
+
<h1>How to use:</h1>
|
139 |
<ol>
|
140 |
<li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
|
141 |
<li> <b>Input lyrics</b>:
|
142 |
<ul>
|
143 |
+
<li> Lyrics use Chinese characters when the language is 'zh' and hiragana when the language is 'jp'. </li>
|
144 |
+
<li> Special characters such as 'AP' (breath), 'SP' (silence), and '-' (slur, only for 'zh') can also be used. </li>
|
145 |
<li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
146 |
</ul>
|
147 |
</li>
|
|
|
161 |
<li> <b>Click submit button</b> </li>
|
162 |
</ol>
|
163 |
|
164 |
+
<h1>Notice:</h1>
|
165 |
+
<ul>
|
166 |
+
<li> Values outside this range may result in suboptimal generation quality! </li>
|
167 |
+
</ul>
|
168 |
</div>
|
169 |
"""
|
170 |
|
|
|
192 |
examples = [
|
193 |
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
|
194 |
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
|
195 |
+
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest", "singer1 (male)"],
|
196 |
+
["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
|
197 |
+
["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
|
198 |
+
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
|
199 |
+
["zh", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0", "singer2 (female)"],
|
200 |
+
["zh", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0", "singer2 (female)"],
|
201 |
+
["zh", "SP 你 看 着 车 窗 - SP", " 0.41 0.96 0.7 0.64 1.12 1.14 1.04 0.29", "0 60 60 62 60 64 65 0", "singer3 (male)"],
|
202 |
+
["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
|
203 |
+
["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58", "singer8 (female)"],
|
204 |
+
["jp", "い じ ん さ ん に つ れ ら れ て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
|
205 |
+
["jp", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
|
206 |
["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
|
207 |
+
["jp", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60", "singer10 (female)"],
|
208 |
+
["jp", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59", "singer9 (male)"],
|
209 |
]
|
210 |
|
211 |
app = gr.Interface(
|
util.py
CHANGED
@@ -2,6 +2,8 @@ import os
|
|
2 |
import json
|
3 |
import warnings
|
4 |
from typing import List
|
|
|
|
|
5 |
|
6 |
import pyopenjtalk
|
7 |
|
@@ -21,7 +23,7 @@ def pyopenjtalk_g2p(text) -> List[str]:
|
|
21 |
for warning in w:
|
22 |
if "No phoneme" in str(warning.message):
|
23 |
return False
|
24 |
-
|
25 |
phones = phones.split(" ")
|
26 |
return phones
|
27 |
|
@@ -48,6 +50,20 @@ def get_tokenizer(lang):
|
|
48 |
elif lang == "jp":
|
49 |
return pyopenjtalk_g2p
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def load_pitch_dict(file_path = "resource/midi-note.scp"):
|
52 |
pitch_dict = {}
|
53 |
with open(file_path, "r", encoding="utf-8") as f:
|
|
|
2 |
import json
|
3 |
import warnings
|
4 |
from typing import List
|
5 |
+
from pypinyin import lazy_pinyin
|
6 |
+
import re
|
7 |
|
8 |
import pyopenjtalk
|
9 |
|
|
|
23 |
for warning in w:
|
24 |
if "No phoneme" in str(warning.message):
|
25 |
return False
|
26 |
+
|
27 |
phones = phones.split(" ")
|
28 |
return phones
|
29 |
|
|
|
50 |
elif lang == "jp":
|
51 |
return pyopenjtalk_g2p
|
52 |
|
53 |
+
|
54 |
+
def get_pinyin(texts):
    """Convert Chinese lyrics into a flat list of pinyin syllables and score markers.

    ``texts`` is handed to ``pypinyin.lazy_pinyin``. Items that start with a
    marker character are scanned for the special score tokens ``AP``, ``SP``,
    ``-`` and ``——``; each token found becomes its own entry and any other
    characters in that item are discarded. Every other item is kept unchanged.

    NOTE(review): the splitting assumes ``lazy_pinyin`` returns a run of
    adjacent non-Chinese characters (e.g. ``"-SP"``) as ONE item - confirm
    against the pypinyin version in use.

    Args:
        texts: Lyrics string to convert.

    Returns:
        list: Pinyin syllables and special tokens, in input order.
    """
    text_list = []
    for text in lazy_pinyin(texts):
        if not text:
            # An empty item carries no syllable or marker; skipping it also
            # avoids indexing into an empty string.
            continue
        # gen_song accepts both "-" and "——" as slur markers, so both must be
        # recognised here; otherwise "SP——" would lose the slur and "——SP"
        # would reach the tokenizer as one unsplit item.
        if text.startswith(("S", "A", "-", "——")):
            text_list.extend(re.findall(r"——|-|AP|SP", text))
        else:
            text_list.append(text)
    return text_list
|
65 |
+
|
66 |
+
|
67 |
def load_pitch_dict(file_path = "resource/midi-note.scp"):
|
68 |
pitch_dict = {}
|
69 |
with open(file_path, "r", encoding="utf-8") as f:
|