Spaces:

projecte-aina
/

matxa-alvocat-tts-ca

Running

App Files Files Community

matxa-alvocat-tts-ca / infer_onnx.py

wetdog

set fixed path for temp files and expose parameters in the interface

6b0bcdf 10 months ago

raw

history blame

6.77 kB

	import numpy as np
	import onnxruntime

	import utils
	from text import text_to_sequence, sequence_to_text
	import torch
	import gradio as gr
	import soundfile as sf
	import tempfile
	import yaml

	from time import perf_counter

	def intersperse(lst, item):
	    """Return a new list with ``item`` before, between and after the elements of ``lst``.

	    The result always has ``2 * len(lst) + 1`` entries; an empty ``lst``
	    yields ``[item]``.
	    """
	    padded = []
	    for element in lst:
	        # Each original element is preceded by one separator.
	        padded += [item, element]
	    # Closing separator so the sequence both starts and ends with ``item``.
	    padded.append(item)
	    return padded


	def process_text(i: int, text: str, device: torch.device):
	    """Convert ``text`` into the integer inputs expected by the Matcha ONNX models.

	    The text is passed through ``text_to_sequence`` with the
	    ``catalan_cleaners`` cleaner, a ``0`` is interspersed around every
	    symbol id, and the result is given a leading batch dimension of size 1.
	    The input text and the decoded symbol sequence are printed for debugging.

	    Args:
	        i: Index used only to tag the log line.
	        text: Raw input text.
	        device: Device on which the intermediate tensors are created.

	    Returns:
	        A tuple ``(x, x_lengths)`` of NumPy arrays: the id sequence with
	        shape ``(1, T)`` and a one-element array holding ``T``.
	    """
	    print(f"[{i}] - Input text: {text}")
	    token_ids = intersperse(text_to_sequence(text, ["catalan_cleaners"]), 0)
	    x = torch.tensor(token_ids, dtype=torch.long, device=device)[None]
	    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
	    x_phones = sequence_to_text(x.squeeze(0).tolist())
	    print(x_phones)
	    # Tensor.numpy() only works for CPU tensors; move them first so a
	    # non-CPU ``device`` does not raise here (no-op when already on CPU).
	    return x.cpu().numpy(), x_lengths.cpu().numpy()

	# Model and config files, given as bare file names (no directory component).
	MODEL_PATH_MATCHA_MEL = "matcha_multispeaker_cat_opset_15_10_steps.onnx"
	MODEL_PATH_MATCHA = "matcha_hifigan_multispeaker_cat.onnx"
	MODEL_PATH_VOCOS = "mel_spec_22khz.onnx"
	CONFIG_PATH = "config_22khz.yaml"

	# One default SessionOptions instance shared by every session below.
	sess_options = onnxruntime.SessionOptions()


	def _cpu_session(model_path):
	    """Open an ONNX Runtime inference session for ``model_path`` on the CPU provider."""
	    return onnxruntime.InferenceSession(
	        str(model_path),
	        sess_options=sess_options,
	        providers=["CPUExecutionProvider"],
	    )


	# Sessions are created once at import time, in this order.
	model_matcha_mel = _cpu_session(MODEL_PATH_MATCHA_MEL)
	model_vocos = _cpu_session(MODEL_PATH_VOCOS)
	model_matcha = _cpu_session(MODEL_PATH_MATCHA)


	def _overlap_add_istft(spectrogram, n_fft, hop_length, win_length):
	    """Invert a complex spectrogram with a Hann window and overlap-add.

	    ``spectrogram`` is a 3-D complex tensor whose dim 1 holds the frequency
	    bins and whose last dim holds the frames. ``(win_length - hop_length) // 2``
	    samples are trimmed from both ends of the reconstructed signal.
	    Returns a real tensor of shape ``(batch, samples)``.
	    """
	    window = torch.hann_window(win_length)
	    pad = (win_length - hop_length) // 2
	    num_frames = spectrogram.shape[-1]

	    # Per-frame inverse FFT, then apply the synthesis window.
	    frames = torch.fft.irfft(spectrogram, n_fft, dim=1, norm="backward")
	    frames = frames * window[None, :, None]

	    output_size = (num_frames - 1) * hop_length + win_length
	    # Explicit end index: a ``pad:-pad`` slice would be empty when pad == 0.
	    end = output_size - pad
	    fold_args = {
	        "output_size": (1, output_size),
	        "kernel_size": (1, win_length),
	        "stride": (1, hop_length),
	    }

	    # Overlap and add the windowed frames.
	    audio = torch.nn.functional.fold(frames, **fold_args)[:, 0, 0, pad:end]

	    # Overlap-added squared window, used to undo the window weighting.
	    window_sq = window.square().expand(1, num_frames, -1).transpose(1, 2)
	    window_envelope = torch.nn.functional.fold(window_sq, **fold_args).squeeze()[pad:end]

	    # Guard against dividing by a (near-)zero envelope.
	    assert (window_envelope > 1e-11).all(), "window envelope is too close to zero"
	    return audio / window_envelope


	def vocos_inference(mel):
	    """Synthesize a waveform from a mel spectrogram with the Vocos ONNX model.

	    STFT parameters (``n_fft``, ``hop_length``) are read from
	    ``config["feature_extractor"]["init_args"]`` in ``CONFIG_PATH``; the
	    window length is set equal to ``n_fft``. The model is fed ``mel`` under
	    the input name ``"mels"`` and returns three arrays that are combined
	    into a complex spectrogram as ``mag * (x + 1j * y)``, which is then
	    inverted with a windowed overlap-add inverse STFT.

	    Args:
	        mel: Array passed as the ``"mels"`` input of the ONNX session
	            (presumably ``(batch, n_mels, frames)`` float32 -- confirm
	            against the exported model).

	    Returns:
	        A real ``torch.Tensor`` of shape ``(batch, samples)``.
	    """
	    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
	        config = yaml.safe_load(f)

	    params = config["feature_extractor"]["init_args"]
	    n_fft = params["n_fft"]
	    hop_length = params["hop_length"]
	    win_length = n_fft

	    # ONNX inference
	    mag, real_part, imag_part = model_vocos.run(None, {"mels": mel})

	    # Complex spectrogram from the Vocos outputs.
	    spectrogram = torch.tensor(mag * (real_part + 1j * imag_part))
	    print("Spectrogram synthesized shape", spectrogram.shape)

	    return _overlap_add_istft(spectrogram, n_fft, hop_length, win_length)


	def tts(text: str, spk_id: int, temperature: float, length_scale: float):
	    """Synthesize ``text`` with both vocoder pipelines and return the two WAV paths.

	    The same model inputs are run through (1) the Matcha mel model followed
	    by Vocos and (2) the combined Matcha + HiFi-GAN model. Each waveform is
	    written as a 24-bit PCM WAV at 22050 Hz into ``/home/user/app``. Timing
	    and real-time-factor (RTF) figures are printed for both pipelines.

	    Args:
	        text: Text to synthesize.
	        spk_id: Speaker id; converted to ``int`` and passed as the ``"spks"``
	            input. ``None`` is forwarded as-is.
	        temperature: First entry of the ``"scales"`` input.
	        length_scale: Second entry of the ``"scales"`` input.

	    Returns:
	        ``(vocos_wav_path, hifigan_wav_path)``. The files are created with
	        ``delete=False`` and are not removed by this function.
	    """
	    sample_rate = 22050
	    output_dir = "/home/user/app"

	    sid = np.array([int(spk_id)]) if spk_id is not None else None
	    text_matcha, text_lengths = process_text(0, text, "cpu")

	    # Both pipelines take exactly the same inputs, so build them once.
	    inputs = {
	        "x": text_matcha,
	        "x_lengths": text_lengths,
	        "scales": np.array([temperature, length_scale], dtype=np.float32),
	        "spks": sid,
	    }

	    # MATCHA VOCOS
	    mel_t0 = perf_counter()
	    mel, _mel_lengths = model_matcha_mel.run(None, inputs)
	    mel_infer_secs = perf_counter() - mel_t0
	    print("Matcha Mel inference time", mel_infer_secs)

	    vocos_t0 = perf_counter()
	    wavs_vocos = vocos_inference(mel)
	    vocos_infer_secs = perf_counter() - vocos_t0
	    print("Vocos inference time", vocos_infer_secs)

	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir=output_dir) as fp_matcha_vocos:
	        sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), sample_rate, "PCM_24")

	    # MATCHA HIFIGAN
	    hifigan_t0 = perf_counter()
	    wavs, _wav_lengths = model_matcha.run(None, inputs)
	    hifigan_infer_secs = perf_counter() - hifigan_t0
	    print("Matcha + Hifigan",hifigan_infer_secs)

	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir=output_dir) as fp_matcha:
	        sf.write(fp_matcha.name, wavs.squeeze(0), sample_rate, "PCM_24")

	    # RTF = processing time / duration of the audio that pipeline produced.
	    # The Vocos figure must use the Vocos waveform length, not the HiFi-GAN one.
	    hifigan_audio_secs = wavs.shape[1] / sample_rate
	    vocos_audio_secs = wavs_vocos.shape[-1] / sample_rate
	    print(f"RTF matcha + hifigan {hifigan_infer_secs / hifigan_audio_secs}")
	    print(f"RTF matcha + vocos {(mel_infer_secs + vocos_infer_secs) / vocos_audio_secs}")
	    return fp_matcha_vocos.name, fp_matcha.name

	## GUI space
	# Static text shown on the demo page; each string is rendered through
	# gr.Markdown when the Blocks layout is assembled further down.

	# HTML header with the page title.
	title = """
	<div style="text-align: center; max-width: 700px; margin: 0 auto;">
	<div
	style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
	> <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
	TTS Vocoder Comparison
	</h1> </div>
	</div>
	"""

	# Short description of the acoustic model, the vocoders and the training data.
	description = """

	🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis

	For vocoders we use Hifigan universal version and Vocos trained in a catalan set of ~28 hours.

	Matcha was trained using openslr69 and festcat datasets
	"""

	# Credit line rendered after the interface.
	article = "Training and demo by BSC."

	# Input widgets, in the positional order expected by tts():
	# text, spk_id, temperature, length_scale.
	_tts_inputs = [
	    gr.Textbox(
	        label="Input text",
	        value="m'ha costat molt desenvolupar una veu, i ara que la tinc no estaré en silenci.",
	        max_lines=1,
	    ),
	    gr.Slider(
	        minimum=1,
	        maximum=47,
	        value=10,
	        step=1,
	        label="Speaker id",
	        info="Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
	    ),
	    gr.Slider(
	        minimum=0.1,
	        maximum=2.0,
	        value=0.667,
	        step=0.01,
	        label="Temperature",
	        info="Temperature",
	    ),
	    gr.Slider(
	        minimum=0.5,
	        maximum=2.0,
	        value=1.0,
	        step=0.01,
	        label="Length scale",
	        info="Controls speech pace, larger values for slower pace and smaller values for faster pace",
	    ),
	]

	# Output players, in the order tts() returns its file paths:
	# Matcha + Vocos first, Matcha + HiFi-GAN second.
	_tts_outputs = [
	    gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
	    gr.Audio(label="Matcha hifigan", interactive=False, type="filepath"),
	]

	# Interface wiring the widgets above to the tts() function.
	vits2_inference = gr.Interface(
	    fn=tts,
	    inputs=_tts_inputs,
	    outputs=_tts_outputs,
	)

	# Page layout: header, description, the single-tab interface, then the credit line.
	with gr.Blocks() as demo:
	    gr.Markdown(title)
	    gr.Markdown(description)
	    gr.TabbedInterface(interface_list=[vits2_inference], tab_names=["Multispeaker"])
	    gr.Markdown(article)

	# Queue at most 10 pending requests, then serve on all interfaces at port 7860.
	demo.queue(max_size=10)
	demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)