# TangRain's picture
# update app.py
# 903962c
# raw
# history blame
# 8.84 kB
import os
import numpy as np
import gradio as gr
import pyopenjtalk
from pypinyin import lazy_pinyin
from util import preprocess_input, get_tokenizer, load_pitch_dict
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.fileio.read_text import read_label
from espnet2.bin.svs_inference import SingingGenerate
# Mapping from the singer names shown in the UI to pre-extracted speaker
# embedding files (.npy) that are loaded and passed to the model as `spembs`.
# NOTE(review): "singer4" appears twice (female and male) and "singer5" is
# missing — these keys must stay in sync with the Radio choices in the
# gr.Interface below, so any renaming has to be done in both places at once.
singer_embeddings = {
    "singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
    "singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
    "singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
    "singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
    "singer4 (male)": "resource/singer/singer_embedding_ace-7.npy",
    "singer6 (female)": "resource/singer/singer_embedding_itako.npy",
    "singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
    "singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
    "singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
    "singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
}
# Language ids expected by the pretrained model (passed as `lids` at inference).
langs = {
    "zh": 2,
    "jp": 1,
}
def gen_song(lang, texts, durs, pitchs, spk):
    """Generate a singing-voice waveform from a textual music score.

    Args:
        lang: Language id for the lyrics, "zh" (Chinese) or "jp" (Japanese).
        texts: Lyric tokens (zh: raw characters; jp: space-separated syllables).
        durs: Per-lyric durations in seconds, space/newline separated.
        pitchs: Per-lyric pitches (MIDI numbers or note names such as "C4"),
            space/newline separated; must align 1:1 with the lyrics.
        spk: Singer name, a key of `singer_embeddings`.

    Returns:
        A tuple `((sample_rate, waveform), status)`. On any validation error
        the waveform is a single zero sample and `status` explains the issue.
    """
    fs = 44100  # output sample rate of the pretrained vocoder
    tempo = 120  # fixed tempo (BPM) written into the score
    PRETRAIN_MODEL = "TangRain/mixdata_svs_visinger2_spkembed_lang_pretrained"

    # --- input validation ---------------------------------------------------
    if texts is None:
        return (fs, np.array([0.0])), "Error: No Text provided!"
    if durs is None:
        return (fs, np.array([0.0])), "Error: No Dur provided!"
    if pitchs is None:
        return (fs, np.array([0.0])), "Error: No Pitch provided!"
    if spk not in singer_embeddings:
        return (fs, np.array([0.0])), f"Error: singer `{spk}` is invalid!"

    # --- preprocess: normalize separators and tokenize lyrics ----------------
    if lang == "zh":
        texts = preprocess_input(texts, "")
        text_list = lazy_pinyin(texts)  # Chinese characters -> pinyin syllables
    elif lang == "jp":
        texts = preprocess_input(texts, " ")
        text_list = texts.strip().split()
    else:
        # Original code fell through here with `text_list` unbound (NameError).
        return (fs, np.array([0.0])), f"Error: lang `{lang}` is not supported!"

    durs = preprocess_input(durs, " ")
    dur_list = durs.strip().split()
    pitchs = preprocess_input(pitchs, " ")
    pitch_list = pitchs.strip().split()

    if len(text_list) != len(dur_list):
        return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!"
    if len(text_list) != len(pitch_list):
        return (fs, np.array([0.0])), f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!"

    # --- text to phoneme ------------------------------------------------------
    tokenizer = get_tokenizer(lang)
    sybs = []  # one "_"-joined phoneme string per syllable, e.g. "k@jp_a@jp"
    for text in text_list:
        if text == "AP" or text == "SP":
            # AP (aspirate) / SP (silence) markers pass through untagged.
            rev = [text]
        else:
            rev = tokenizer(text)
            # Validate BEFORE tagging: the original compared `rev == False`
            # only after iterating `rev` in a comprehension, so a False/empty
            # tokenizer result crashed with TypeError instead of reporting.
            if not rev:
                return (fs, np.array([0.0])), f"Error: text `{text}` is invalid!"
            rev = [phn + f"@{lang}" for phn in rev]
        sybs.append("_".join(rev))

    # --- assemble score -------------------------------------------------------
    pitch_dict = load_pitch_dict()
    labels = []  # flat phoneme sequence for the `text` field
    notes = []   # [start, end, lyric, midi_pitch, phoneme-string] per syllable
    st = 0
    for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
        if pitch not in pitch_dict:
            return (fs, np.array([0.0])), f"Error: pitch `{pitch}` is invalid!"
        pitch = pitch_dict[pitch]
        try:
            dur = float(dur)
        except ValueError:
            # Report malformed durations instead of crashing the request.
            return (fs, np.array([0.0])), f"Error: duration `{dur}` is invalid!"
        phn_list = phns.split("_")
        lyric = "".join(phn_list)
        notes.append([st, st + dur, lyric, pitch, phns])
        st += dur
        labels.extend(phn_list)

    batch = {
        "score": (int(tempo), notes),
        "text": " ".join(labels),
    }

    # --- inference -------------------------------------------------------------
    device = "cpu"
    # Cache the (expensive) model download + instantiation across calls instead
    # of rebuilding it on every request, as the original did.
    cache = getattr(gen_song, "_svs_cache", None)
    if cache is None:
        cache = gen_song._svs_cache = {}
    svs = cache.get(PRETRAIN_MODEL)
    if svs is None:
        downloaded = ModelDownloader().download_and_unpack(PRETRAIN_MODEL)
        svs = SingingGenerate(
            train_config=downloaded["train_config"],
            model_file=downloaded["model_file"],
            device=device,
        )
        cache[PRETRAIN_MODEL] = svs

    lid = langs[lang]
    spk_embed = np.load(singer_embeddings[spk])
    output_dict = svs(batch, lids=np.array([lid]), spembs=spk_embed)
    wav_info = output_dict["wav"].cpu().numpy()
    return (fs, wav_info), "success!"
# UI copy shown on the demo page (title, HTML description / article) and the
# pre-filled example rows for the Gradio interface.
title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"
description = """
<div style="font-size: 20px;">
<p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
<p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
Music score contains information about lyrics, as well as duration and pitch of each word in lyrics.</p>
<p>How to use:</p>
<ol>
<li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
<li> <b>Input lyrics</b>:
<ul>
<li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
</ul>
</li>
<li> <b>Input durations</b>:
<ul>
<li> Length of duration sequence should <b>be same as lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
<li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
</ul>
</li>
<li> <b>Input pitches</b>:
<ul>
<li> Length of pitch sequence should <b>be same as lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
<li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
</ul>
</li>
<li> <b>Choose one singer</b> </li>
<li> <b>Click submit button</b> </li>
</ol>
<b>Notice</b>: Values outside this range may result in suboptimal generation quality!
</div>
"""
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
<a href="https://github.com/espnet/espnet">espnet GitHub</a> |
<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">pretrained model</a></p>
<pre>
@inproceedings{wu2024muskits,
  title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
  author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
  booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
  year={2024},
}
</pre>
</div>
"""
# Example rows: [language, lyrics, durations, pitches, singer].
# SP: silence, AP: aspirate. Pitches may be MIDI numbers or note names.
examples = [
    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
    ["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
]
# Wire up the Gradio demo: four free-text/radio inputs plus a singer selector,
# producing the synthesized audio and a status message.
app = gr.Interface(
    fn=gen_song,
    title=title,
    description=description,
    article=article,
    examples=examples,
    inputs=[
        gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
        gr.Textbox(label="Lyrics"),
        gr.Textbox(label="Duration"),
        gr.Textbox(label="Pitch"),
        gr.Radio(
            label="Singer",
            # Offer exactly the singers we have embeddings for, in the
            # same order as the `singer_embeddings` mapping above.
            choices=list(singer_embeddings),
            value="singer1 (male)",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Song", type="numpy"),
        gr.Textbox(label="Running Status"),
    ],
)
app.launch()