import os
import numpy as np
import gradio as gr
import pyopenjtalk
from util import preprocess_input, get_tokenizer, load_pitch_dict, get_pinyin
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.fileio.read_text import read_label
from espnet2.bin.svs_inference import SingingGenerate
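# UI singer names mapped to pre-extracted speaker-embedding (.npy) files used to condition the model.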
singer_embeddings = {
"singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
"singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
"singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
"singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
"singer4 (male)": "resource/singer/singer_embedding_ace-7.npy",
"singer6 (female)": "resource/singer/singer_embedding_itako.npy",
"singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
"singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
"singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
"singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
}
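# Language IDs passed to the model as `lids` (values follow the pretrained model's language mapping).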
langs = {
"zh": 2,
"jp": 1,
}
def gen_song(lang, texts, durs, pitchs, spk):
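    """Synthesize singing audio from aligned lyric, duration, and pitch sequences.

    Args:
        lang: language ID, "zh" (Chinese) or "jp" (Japanese).
        texts: lyric tokens separated by spaces or newlines.
        durs: per-token durations in seconds, same length as the lyric sequence.
        pitchs: per-token pitches (MIDI numbers or note names), same length as the lyric sequence.
        spk: singer name, a key of `singer_embeddings`.

    Returns:
        ((sampling_rate, waveform), status_message)
    """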
fs = 44100
tempo = 120
PRETRAIN_MODEL = "TangRain/mixdata_svs_visinger2_spkembed_lang_pretrained"
    if texts is None:
        return (fs, np.array([0.0])), "Error: No lyrics provided!"
    if durs is None:
        return (fs, np.array([0.0])), "Error: No duration provided!"
    if pitchs is None:
        return (fs, np.array([0.0])), "Error: No pitch provided!"
    # preprocess: normalize separators and split the inputs into aligned token lists
    if lang == "zh":
        texts = preprocess_input(texts, "")
        text_list = get_pinyin(texts)
    elif lang == "jp":
        texts = preprocess_input(texts, " ")
        text_list = texts.strip().split()
    else:
        return (fs, np.array([0.0])), f"Error: unsupported language `{lang}`!"
durs = preprocess_input(durs, " ")
dur_list = durs.strip().split()
pitchs = preprocess_input(pitchs, " ")
pitch_list = pitchs.strip().split()
    if len(text_list) != len(dur_list):
        return (fs, np.array([0.0])), f"Error: lyric length ({len(text_list)}) does not match duration length ({len(dur_list)})!"
    if len(text_list) != len(pitch_list):
        return (fs, np.array([0.0])), f"Error: lyric length ({len(text_list)}) does not match pitch length ({len(pitch_list)})!"
## text to phoneme
tokenizer = get_tokenizer(lang)
    sybs = []  # one "_"-joined phoneme string per lyric token
    for text in text_list:
        if text == "AP" or text == "SP":
            rev = [text]
        elif text == "-" or text == "——":
            rev = [text]
        else:
            rev = tokenizer(text)
            if not rev:
                return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!"
            rev = [phn + f"@{lang}" for phn in rev]
        phns = "_".join(rev)
        sybs.append(phns)
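    # load_pitch_dict() maps pitch tokens (MIDI numbers or note names such as C4 / rest, as in the examples below) to the numeric values stored in the score.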
pitch_dict = load_pitch_dict()
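    # Build the note-level score and the flat phoneme label sequence; "-" / "——" carries the previous phoneme forward (slur).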
labels = []
notes = []
st = 0
pre_phn = ""
for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
if phns == "-" or phns == "——":
phns = pre_phn
if pitch not in pitch_dict:
return (fs, np.array([0.0])), f"Error: pitch `{pitch}` is invalid!"
pitch = pitch_dict[pitch]
phn_list = phns.split("_")
lyric = "".join(phn_list)
dur = float(dur)
note = [st, st + dur, lyric, pitch, phns]
st += dur
notes.append(note)
for phn in phn_list:
labels.append(phn)
pre_phn = labels[-1]
phns_str = " ".join(labels)
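    # Batch in the format expected by SingingGenerate:
    #   "score": (tempo, [[start, end, lyric, pitch, phonemes], ...])
    #   "text":  space-separated phoneme sequence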
batch = {
"score": (
int(tempo),
notes,
),
"text": phns_str,
}
# print(batch)
# return (fs, np.array([0.0])), "success!"
# Infer
device = "cpu"
# device = "cuda" if torch.cuda.is_available() else "cpu"
d = ModelDownloader()
pretrain_downloaded = d.download_and_unpack(PRETRAIN_MODEL)
    svs = SingingGenerate(
        train_config=pretrain_downloaded["train_config"],
        model_file=pretrain_downloaded["model_file"],
        device=device,
    )
# sid = spks[spk]
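    # Condition synthesis on the language ID (`lids`) and the singer's speaker embedding (`spembs`).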
lid = langs[lang]
spk_embed = np.load(singer_embeddings[spk])
# output_dict = svs(batch, sids=np.array([sid]))
output_dict = svs(batch, lids=np.array([lid]), spembs=spk_embed)
wav_info = output_dict["wav"].cpu().numpy()
return (fs, wav_info), "success!"
title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"
description = """
<div style="font-size: 20px; ">
<p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
<p>Singing Voice Synthesis (SVS) takes a music score as input and generates a singing voice with the timbre of a specific singer.\n
A music score contains the lyrics, together with the duration and pitch of each word.</p>
<h1>How to use:</h1>
<ol>
<li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
<li> <b>Input lyrics</b>:
<ul>
<li> Lyrics use Chinese characters when the language is 'zh' and hiragana when the language is 'jp'. </li>
<li> Special characters such as 'AP' (breath), 'SP' (silence), and '-' (slur, only for 'zh') can also be used. </li>
<li> Tokens in the lyric sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
</ul>
</li>
<li> <b>Input durations</b>:
<ul>
<li> The duration sequence should have the <b>same length as the lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
<li> Tokens in the duration sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
</ul>
</li>
<li> <b>Input pitches</b>:
<ul>
<li> The pitch sequence should have the <b>same length as the lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
<li> Tokens in the pitch sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
</ul>
</li>
<li> <b>Choose one singer</b> </li>
<li> <b>Click submit button</b> </li>
</ol>
<h1>Notice:</h1>
<ul>
<li> Durations and pitches should stay close to the ranges shown in the examples below; values far outside those ranges may result in suboptimal generation quality! </li>
</ul>
</div>
"""
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
<a href="https://github.com/espnet/espnet">espnet GitHub</a> |
<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">pretrained model</a></p>
<pre>
@inproceedings{wu2024muskits,
title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
year={2024},
}
</pre>
</div>
"""
# SP: silence, AP: breath (aspirate).
examples = [
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest", "singer1 (male)"],
["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
["zh", "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89", "63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP\n你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34\n0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0\n63 63 63 63 0 63\n62 62 62 63 65 63 62 0", "singer1 (male)"],
["zh", "修 炼 爱 情 的 心 酸 SP AP", "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29", "68 70 68 66 63 68 68 0 0", "singer2 (female)"],
["zh", "学 会 放 好 以 前 的 渴 望 SP AP", "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39", "68 70 68 66 61 68 68 65 66 0 0", "singer2 (female)"],
["zh", "SP 你 看 着 车 窗 - SP", " 0.41 0.96 0.7 0.64 1.12 1.14 1.04 0.29", "0 60 60 62 60 64 65 0", "singer3 (male)"],
["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
["jp", "い じ ん さ ん に つ れ ら れ て", "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23", "62 62 62 58 58 58 57 57 57 55 58", "singer8 (female)"],
["jp", "い じ ん さ ん に つ れ ら れ て", "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
["jp", "い じ ん さ ん に つ れ ら れ て", "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11", "60 60 60 56 56 56 55 55 55 53 56", "singer8 (female)"],
["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
["jp", "じゃ の め で お む か え う れ し い な", "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65", "60 60 60 62 64 67 69 69 64 64 64 62 60", "singer10 (female)"],
["jp", "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん", "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15", "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59", "singer9 (male)"],
]
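# Gradio UI: language / lyrics / duration / pitch / singer inputs; generated audio and a status message as outputs.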
app = gr.Interface(
fn=gen_song,
inputs=[
gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
gr.Textbox(label="Lyrics"),
gr.Textbox(label="Duration"),
gr.Textbox(label="Pitch"),
gr.Radio(
label="Singer",
choices=[
"singer1 (male)",
"singer2 (female)",
"singer3 (male)",
"singer4 (female)",
"singer4 (male)",
"singer6 (female)",
"singer7 (male)",
"singer8 (female)",
"singer9 (male)",
"singer10 (female)",
],
value="singer1 (male)",
),
],
outputs=[
gr.Audio(label="Generated Song", type="numpy"),
gr.Textbox(label="Running Status"),
],
title=title,
description=description,
article=article,
examples=examples,
)
app.launch()