Spaces:

TangRain
/

muskits-espnet-svs-demo

Running

App Files Files Community

muskits-espnet-svs-demo / app.py

TangRain

feat(demo-v1): support Chinese song with pretrained VISinger2

9df835b 5 months ago

raw

history blame

5.75 kB

	import numpy as np
	import gradio as gr
	from pypinyin import lazy_pinyin

	from pinyin_dict import PINYIN_DICT

	from espnet_model_zoo.downloader import ModelDownloader
	from espnet2.fileio.read_text import read_label
	from espnet2.bin.svs_inference import SingingGenerate


	# Each selectable singer label paired with the numeric speaker id that is
	# looked up by label and passed to the model as `sids`.
	_SINGER_IDS = (
	    ("singer1 (man)", 1),
	    ("singer2 (man)", 2),
	    ("singer3 (female)", 5),
	    ("singer4 (female)", 9),
	    ("singer5 (man)", 18),
	    ("singer6 (female)", 15),
	    ("singer7 (man)", 23),
	    ("singer8 (man)", 25),
	    ("singer9 (female)", 29),
	    ("singer10 (man)", 27),
	)

	# Label -> speaker id, in the order the singers are listed above.
	spks = dict(_SINGER_IDS)

	# Synthesizers that have already been built, keyed by (model tag, device), so
	# the pretrained model is downloaded and constructed once per process instead
	# of on every request.
	_SVS_CACHE = {}


	def _error(fs, message):
	    """Return the (audio, status) pair reported for a rejected request."""
	    return (fs, np.array([0.0])), message


	def _load_pitch_table(path="./midi-note.scp"):
	    """Read the pitch table at *path* into a ``{token: int}`` lookup.

	    Each usable line holds at least two whitespace-separated fields; the
	    second is parsed as an integer and registered under both the first field
	    and its own text, so either spelling can be typed as a pitch (the bundled
	    examples use both ``C4`` and ``60``).
	    """
	    table = {}
	    with open(path, "r", encoding="utf-8") as f:
	        for line in f:
	            items = line.split()
	            if len(items) < 2:
	                # Blank or truncated line: nothing to register.
	                continue
	            value = int(items[1])
	            table[items[0]] = value
	            table[items[1]] = value
	    return table


	def _get_svs(model_tag, device):
	    """Return the SingingGenerate for *model_tag*, building it on first use."""
	    key = (model_tag, device)
	    if key not in _SVS_CACHE:
	        downloaded = ModelDownloader().download_and_unpack(model_tag)
	        _SVS_CACHE[key] = SingingGenerate(
	            train_config=downloaded["train_config"],
	            model_file=downloaded["model_file"],
	            device=device,
	        )
	    return _SVS_CACHE[key]


	def gen_song(lang, tempo, texts, durs, pitchs, spk):
	    """Synthesize a sung phrase from lyrics, note durations and pitches.

	    Args:
	        lang: Language id; only ``"zh"`` is supported.
	        tempo: Tempo as an integer or a string that parses as one.
	        texts: Lyric string; it is split into syllables with ``lazy_pinyin``.
	        durs: Whitespace-separated durations, one per syllable.
	        pitchs: Whitespace-separated pitch tokens, one per syllable; each
	            must be a key of the table read from ``./midi-note.scp``.
	        spk: Singer label, a key of the module-level ``spks`` mapping.

	    Returns:
	        ``((fs, wav), status)``. On success ``wav`` is the generated waveform
	        as a NumPy array and ``status`` is ``"success!"``. On invalid input
	        ``wav`` is ``np.array([0.0])`` and ``status`` is an ``"Error: ..."``
	        message; bad input is reported this way rather than raised.
	    """
	    # Sample rate reported with every result, including error results.
	    fs = 44100
	    if lang != "zh":
	        return _error(fs, f"Error: language `{lang}` is not supported!")
	    model_tag = "espnet/aceopencpop_svs_visinger2_40singer_pretrain"

	    # Validate the raw fields before touching them. An untouched textbox
	    # yields an empty string rather than None, so blank input is rejected too.
	    if texts is None or not texts.strip():
	        return _error(fs, "Error: No Text provided!")
	    if durs is None or not durs.strip():
	        return _error(fs, "Error: No Dur provided!")
	    if pitchs is None or not pitchs.strip():
	        return _error(fs, "Error: No Pitch provided!")

	    try:
	        tempo_value = int(tempo)
	    except (TypeError, ValueError):
	        return _error(fs, f"Error: tempo `{tempo}` is invalid!")
	    if tempo_value <= 0:
	        return _error(fs, f"Error: tempo `{tempo}` is invalid!")

	    text_list = lazy_pinyin(texts)
	    dur_list = durs.split()
	    pitch_list = pitchs.split()

	    if len(text_list) != len(dur_list):
	        return _error(
	            fs,
	            f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!",
	        )
	    if len(text_list) != len(pitch_list):
	        return _error(
	            fs,
	            f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!",
	        )

	    # Text to phonemes: each syllable becomes its phonemes joined by "_".
	    sybs = []
	    for text in text_list:
	        text = text.lower()
	        if text not in PINYIN_DICT:
	            return _error(fs, f"Error: pinyin `{text}` is invalid!")
	        sybs.append("_".join(PINYIN_DICT[text]))

	    pitch_dict = _load_pitch_table()

	    # Build the score: one note per syllable laid end to end on the time
	    # axis, plus the flat phoneme sequence for the whole phrase.
	    labels = []
	    notes = []
	    st = 0
	    for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
	        if pitch not in pitch_dict:
	            return _error(fs, f"Error: pitch `{pitch}` is invalid!")
	        try:
	            dur_value = float(dur)
	        except ValueError:
	            return _error(fs, f"Error: duration `{dur}` is invalid!")
	        phn_list = phns.split("_")
	        lyric = "".join(phn_list)
	        notes.append([st, st + dur_value, lyric, pitch_dict[pitch], phns])
	        st += dur_value
	        labels.extend(phn_list)

	    batch = {
	        "score": (tempo_value, notes),
	        "text": " ".join(labels),
	    }

	    if spk not in spks:
	        return _error(fs, f"Error: singer `{spk}` is invalid!")

	    # Inference runs on CPU.
	    svs = _get_svs(model_tag, "cpu")
	    output_dict = svs(batch, sids=np.array([spks[spk]]))
	    wav_info = output_dict["wav"].cpu().numpy()
	    return (fs, wav_info), "success!"


	# Page heading passed to the Gradio interface as `title`.
	title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"

	# HTML shown under the heading: what the demo is and how to fill in the form.
	description = """
	This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.


	<p>How to use:</p>
	<ol>
	<li> Choose language ID. Language id </li>
	<li> Input tempo in integer </li>
	<li> Input text, duration, pitch of equal length </li>
	<li> Choose one singer </li>
	<li> Click submit button </li>
	</ol>


	"""

	# HTML footer: reference links and the BibTeX entry for the toolkit paper.
	article = """
	<div style='margin:20px auto;'>

	<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
	<a href="https://github.com/espnet/espnet">espnet GitHub</a> |
	<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">pretrained model</a></p>

	<pre>
	@inproceedings{wu2024muskits,
	title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
	author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
	booktitle={Proc. ACM Multimedia},
	year={2024},
	}
	</pre>

	</div>
	"""


	# SP: silence, AP: aspirate.
	# Every bundled example uses the same language, tempo, lyric and durations;
	# only the spelling of the pitch sequence and the singer differ.
	_EXAMPLE_LYRIC = "雨淋湿了SP天空AP"
	_EXAMPLE_DURATIONS = "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21"
	_EXAMPLE_VARIANTS = (
	    ("60 62 62 62 0 62 58 0", "singer1 (man)"),
	    ("C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (man)"),
	    ("C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"),
	)

	# One row per example, in the order: language, tempo, text, duration, pitch, singer.
	examples = [
	    ["zh", 89, _EXAMPLE_LYRIC, _EXAMPLE_DURATIONS, pitch_seq, singer]
	    for pitch_seq, singer in _EXAMPLE_VARIANTS
	]

	# Build the demo page and start serving it. The inputs are listed in the
	# positional order of gen_song's parameters; the outputs match its
	# ((fs, wav), status) return value.
	gr.Interface(
	    fn=gen_song,
	    inputs=[
	        gr.Radio(label="language", choices=["zh"], value="zh"),
	        gr.Textbox(label="Tempo"),
	        gr.Textbox(label="Text"),
	        gr.Textbox(label="Duration"),
	        gr.Textbox(label="Pitch"),
	        gr.Radio(
	            label="Singer",
	            # Take the labels from `spks` so the form can only offer singers
	            # that gen_song is able to look up.
	            choices=list(spks),
	            value="singer1 (man)",
	        ),
	    ],
	    outputs=[
	        gr.Audio(label="Generated Song", type="numpy"),
	        gr.Textbox(label="Running Status"),
	    ],
	    title=title,
	    description=description,
	    article=article,
	    examples=examples,
	).launch()