TangRain committed on
Commit
1324481
·
1 Parent(s): 903962c

v1: single model

Browse files
Files changed (2) hide show
  1. app.py +31 -10
  2. util.py +17 -1
app.py CHANGED
@@ -2,8 +2,7 @@ import os
2
  import numpy as np
3
  import gradio as gr
4
  import pyopenjtalk
5
- from pypinyin import lazy_pinyin
6
- from util import preprocess_input, get_tokenizer, load_pitch_dict
7
 
8
  from espnet_model_zoo.downloader import ModelDownloader
9
  from espnet2.fileio.read_text import read_label
@@ -46,7 +45,7 @@ def gen_song(lang, texts, durs, pitchs, spk):
46
  # preprocess
47
  if lang == "zh":
48
  texts = preprocess_input(texts, "")
49
- text_list = lazy_pinyin(texts)
50
  elif lang == "jp":
51
  texts = preprocess_input(texts, " ")
52
  text_list = texts.strip().split()
@@ -66,6 +65,8 @@ def gen_song(lang, texts, durs, pitchs, spk):
66
  for text in text_list:
67
  if text == "AP" or text == "SP":
68
  rev = [text]
 
 
69
  else:
70
  rev = tokenizer(text)
71
  rev = [phn + f"@{lang}" for phn in rev]
@@ -79,18 +80,22 @@ def gen_song(lang, texts, durs, pitchs, spk):
79
  labels = []
80
  notes = []
81
  st = 0
 
82
  for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
 
 
83
  if pitch not in pitch_dict:
84
  return (fs, np.array([0.0])), f"Error: pitch `{pitch}` is invalid!"
85
  pitch = pitch_dict[pitch]
86
- dur = float(dur)
87
  phn_list = phns.split("_")
88
  lyric = "".join(phn_list)
 
89
  note = [st, st + dur, lyric, pitch, phns]
90
  st += dur
91
  notes.append(note)
92
  for phn in phn_list:
93
  labels.append(phn)
 
94
 
95
  phns_str = " ".join(labels)
96
  batch = {
@@ -125,16 +130,18 @@ def gen_song(lang, texts, durs, pitchs, spk):
125
  title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"
126
 
127
  description = """
128
- <div style="font-size: 20px;">
129
- <p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
130
  <p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
131
  Music score contains information about lyrics, as well as duration and pitch of each word in lyrics.</p>
132
 
133
- <p>How to use:</p>
134
  <ol>
135
  <li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
136
  <li> <b>Input lyrics</b>:
137
  <ul>
 
 
138
  <li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
139
  </ul>
140
  </li>
@@ -154,7 +161,10 @@ description = """
154
  <li> <b>Click submit button</b> </li>
155
  </ol>
156
 
157
- <b>Notice</b>: Values outside this range may result in suboptimal generation quality!
 
 
 
158
  </div>
159
  """
160
 
@@ -182,9 +192,20 @@ article = """
182
  examples = [
183
  ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
184
  ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
185
- # ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (male)"],
186
- # ["zh", 89, " 湿 SP AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"],
 
 
 
 
 
 
 
 
 
187
  ["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
 
 
188
  ]
189
 
190
  app = gr.Interface(
 
2
  import numpy as np
3
  import gradio as gr
4
  import pyopenjtalk
5
+ from util import preprocess_input, get_tokenizer, load_pitch_dict, get_pinyin
 
6
 
7
  from espnet_model_zoo.downloader import ModelDownloader
8
  from espnet2.fileio.read_text import read_label
 
45
  # preprocess
46
  if lang == "zh":
47
  texts = preprocess_input(texts, "")
48
+ text_list = get_pinyin(texts)
49
  elif lang == "jp":
50
  texts = preprocess_input(texts, " ")
51
  text_list = texts.strip().split()
 
65
  for text in text_list:
66
  if text == "AP" or text == "SP":
67
  rev = [text]
68
+ elif text == "-" or text == "——":
69
+ rev = [text]
70
  else:
71
  rev = tokenizer(text)
72
  rev = [phn + f"@{lang}" for phn in rev]
 
80
  labels = []
81
  notes = []
82
  st = 0
83
+ pre_phn = ""
84
  for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
85
+ if phns == "-" or phns == "——":
86
+ phns = pre_phn
87
  if pitch not in pitch_dict:
88
  return (fs, np.array([0.0])), f"Error: pitch `{pitch}` is invalid!"
89
  pitch = pitch_dict[pitch]
 
90
  phn_list = phns.split("_")
91
  lyric = "".join(phn_list)
92
+ dur = float(dur)
93
  note = [st, st + dur, lyric, pitch, phns]
94
  st += dur
95
  notes.append(note)
96
  for phn in phn_list:
97
  labels.append(phn)
98
+ pre_phn = labels[-1]
99
 
100
  phns_str = " ".join(labels)
101
  batch = {
 
130
  title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"
131
 
132
  description = """
133
+ <div style="font-size: 20px;">
134
+ This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.
135
  <p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
136
  Music score contains information about lyrics, as well as duration and pitch of each word in lyrics.</p>
137
 
138
+ <h1>How to use:</h1>
139
  <ol>
140
  <li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
141
  <li> <b>Input lyrics</b>:
142
  <ul>
143
+ <li> Lyrics use Chinese characters when the language is 'zh' and hiragana when the language is 'jp'. </li>
144
+ <li> Special characters such as 'AP' (breath), 'SP' (silence), and '-' (slur, only for 'zh') can also be used. </li>
145
  <li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
146
  </ul>
147
  </li>
 
161
  <li> <b>Click submit button</b> </li>
162
  </ol>
163
 
164
+ <h1>Notice:</h1>
165
+ <ul>
166
+ <li> Values outside this range may result in suboptimal generation quality! </li>
167
+ </ul>
168
  </div>
169
  """
170
 
 
192
  examples = [
193
  ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
194
  ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
195
+ ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest", "singer1 (male)"],
196
+ ["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
197
+ ["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
198
+ ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
199
+ ["zh", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0", "singer2 (female)"],
200
+ ["zh", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0", "singer2 (female)"],
201
+ ["zh", "SP 你 看 着 车 窗 - SP", " 0.41 0.96 0.7 0.64 1.12 1.14 1.04 0.29", "0 60 60 62 60 64 65 0", "singer3 (male)"],
202
+ ["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
203
+ ["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58", "singer8 (female)"],
204
+ ["jp", "い じ ん さ ん に つ れ ら れ て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
205
+ ["jp", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
206
  ["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
207
+ ["jp", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60", "singer10 (female)"],
208
+ ["jp", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59", "singer9 (male)"],
209
  ]
210
 
211
  app = gr.Interface(
util.py CHANGED
@@ -2,6 +2,8 @@ import os
2
  import json
3
  import warnings
4
  from typing import List
 
 
5
 
6
  import pyopenjtalk
7
 
@@ -21,7 +23,7 @@ def pyopenjtalk_g2p(text) -> List[str]:
21
  for warning in w:
22
  if "No phoneme" in str(warning.message):
23
  return False
24
-
25
  phones = phones.split(" ")
26
  return phones
27
 
@@ -48,6 +50,20 @@ def get_tokenizer(lang):
48
  elif lang == "jp":
49
  return pyopenjtalk_g2p
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def load_pitch_dict(file_path = "resource/midi-note.scp"):
52
  pitch_dict = {}
53
  with open(file_path, "r", encoding="utf-8") as f:
 
2
  import json
3
  import warnings
4
  from typing import List
5
+ from pypinyin import lazy_pinyin
6
+ import re
7
 
8
  import pyopenjtalk
9
 
 
23
  for warning in w:
24
  if "No phoneme" in str(warning.message):
25
  return False
26
+
27
  phones = phones.split(" ")
28
  return phones
29
 
 
50
  elif lang == "jp":
51
  return pyopenjtalk_g2p
52
 
53
+
54
# Score markers that must come out of pinyin conversion as stand-alone
# tokens: "AP", "SP", and the slur marks "-" / "——".  "——" is included
# because gen_song treats it the same way as "-".
_SPECIAL_TOKEN_RE = re.compile(r"——|-|AP|SP")


def get_pinyin(texts):
    """Convert Chinese lyrics to a flat list of pinyin syllables and markers.

    Args:
        texts: Lyrics string passed straight to ``lazy_pinyin``.

    Returns:
        A list of strings.  Items produced by ``lazy_pinyin`` that start
        with a score marker ("S", "A", "-" or "——") are split into their
        individual markers ("AP", "SP", "-", "——"); every other item is
        kept unchanged.
    """
    text_list = []
    for text in lazy_pinyin(texts):
        if not text:
            # Guard against an empty item instead of failing on text[0].
            continue
        # NOTE(review): presumably lazy_pinyin returns a run of adjacent
        # non-Chinese characters (e.g. "SPAP-") as ONE item, which is why
        # the run has to be split here -- confirm against pypinyin docs.
        if text.startswith(("S", "A", "-", "——")):
            # Anything in the run that is not a known marker is discarded,
            # matching the previous behaviour for such runs.
            text_list.extend(_SPECIAL_TOKEN_RE.findall(text))
        else:
            text_list.append(text)
    return text_list
65
+
66
+
67
  def load_pitch_dict(file_path = "resource/midi-note.scp"):
68
  pitch_dict = {}
69
  with open(file_path, "r", encoding="utf-8") as f: