Spaces:

Genius-Society
/

hoyoTTS

Running

hoyoTTS / app.py

admin

, show_share_button=False

880b196 1 day ago

10.7 kB

	import re
	import os
	import sys
	import utils
	import torch
	import random
	import commons
	import numpy as np
	import gradio as gr
	from tqdm import tqdm
	from models import SynthesizerTrn
	from huggingface_hub import snapshot_download
	from text import cleaned_text_to_sequence, get_bert
	from text.cleaner import clean_text
	from text.symbols import symbols


	if sys.platform == "darwin":
	    # Enable PyTorch's MPS fallback flag when running on macOS.
	    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

	import logging

	# Keep chatty third-party loggers at WARNING so app messages stay readable.
	logging.getLogger("numba").setLevel(logging.WARNING)
	logging.getLogger("markdown_it").setLevel(logging.WARNING)
	logging.getLogger("urllib3").setLevel(logging.WARNING)
	logging.getLogger("matplotlib").setLevel(logging.WARNING)

	# Plain pipes as column separators. The previous "\|" was an invalid escape
	# sequence: it emitted literal backslashes in every record and triggers a
	# SyntaxWarning on recent Python versions.
	LOG_FORMAT = "| %(name)s | %(levelname)s | %(message)s"
	logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

	logger = logging.getLogger(__name__)
	# Synthesizer instance; assigned by the __main__ section before the UI starts.
	net_g = None
	# When True, the __main__ section switches logging to DEBUG level.
	debug = False


	def get_text(text, language_str, hps):
	    """Turn raw text into the tensors consumed by the synthesizer.

	    The text is cleaned into phones, tones and a per-word phone-count list,
	    converted to id sequences and, when ``hps.data.add_blank`` is set, each
	    sequence is interspersed with ``0`` entries.

	    Args:
	        text: Raw input text.
	        language_str: Language tag forwarded to the text front end.
	        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read here.

	    Returns:
	        ``(bert, phone, tone, language)``. The last three are
	        ``torch.LongTensor``; ``bert`` is whatever ``get_bert`` returns.

	    Raises:
	        AssertionError: If the last dimension of ``bert`` does not equal the
	            number of phone ids.
	    """
	    norm_text, phone_ids, tone_ids, word2ph = clean_text(text, language_str)
	    phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(
	        phone_ids, tone_ids, language_str
	    )

	    if hps.data.add_blank:
	        phone_ids, tone_ids, lang_ids = (
	            commons.intersperse(seq, 0) for seq in (phone_ids, tone_ids, lang_ids)
	        )
	        # Every word now spans twice as many slots as before.
	        for idx, count in enumerate(word2ph):
	            word2ph[idx] = count * 2
	        # One extra slot goes to the first word -- presumably the leading
	        # blank added by intersperse; confirm against commons.intersperse.
	        word2ph[0] += 1

	    bert = get_bert(norm_text, word2ph, language_str)
	    del word2ph
	    assert bert.shape[-1] == len(phone_ids)

	    return (
	        bert,
	        torch.LongTensor(phone_ids),
	        torch.LongTensor(tone_ids),
	        torch.LongTensor(lang_ids),
	    )


	def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid):
	    """Synthesize one piece of text with the loaded model.

	    Reads the module globals ``hps``, ``device`` and ``net_g``, which the
	    ``__main__`` section sets up. The language passed to ``get_text`` is
	    fixed to ``"ZH"``.

	    Args:
	        text: Text to synthesize.
	        sdp_ratio: Forwarded to ``net_g.infer`` as ``sdp_ratio``.
	        noise_scale: Forwarded to ``net_g.infer`` as ``noise_scale``.
	        noise_scale_w: Forwarded to ``net_g.infer`` as ``noise_scale_w``.
	        length_scale: Forwarded to ``net_g.infer`` as ``length_scale``.
	        sid: Speaker name; looked up in ``hps.data.spk2id``.

	    Returns:
	        NumPy array taken from element ``[0][0, 0]`` of the model output,
	        moved to the CPU and cast to float.

	    Raises:
	        KeyError: If ``sid`` is not a key of ``hps.data.spk2id``.
	    """
	    bert, phones, tones, lang_ids = get_text(text, "ZH", hps)
	    with torch.no_grad():
	        # Move each input to the device and insert a new leading axis.
	        phone_batch, tone_batch, lang_batch, bert_batch = (
	            tensor.to(device).unsqueeze(0)
	            for tensor in (phones, tones, lang_ids, bert)
	        )
	        phone_lengths = torch.LongTensor([phones.size(0)]).to(device)
	        speaker_tensor = torch.LongTensor([hps.data.spk2id[sid]]).to(device)

	        outputs = net_g.infer(
	            phone_batch,
	            phone_lengths,
	            speaker_tensor,
	            tone_batch,
	            lang_batch,
	            bert_batch,
	            sdp_ratio=sdp_ratio,
	            noise_scale=noise_scale,
	            noise_scale_w=noise_scale_w,
	            length_scale=length_scale,
	        )
	        return outputs[0][0, 0].data.cpu().float().numpy()


	def tts_fn(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
	    """Synthesize ``text`` as a single utterance for ``speaker``.

	    Args:
	        text: Text to synthesize (passed to ``infer`` unsplit).
	        speaker: Speaker name, forwarded to ``infer`` as ``sid``.
	        sdp_ratio: Forwarded to ``infer``.
	        noise_scale: Forwarded to ``infer``.
	        noise_scale_w: Forwarded to ``infer``.
	        length_scale: Forwarded to ``infer``.

	    Returns:
	        ``(hps.data.sampling_rate, audio)``.
	    """
	    synthesis_options = {
	        "sdp_ratio": sdp_ratio,
	        "noise_scale": noise_scale,
	        "noise_scale_w": noise_scale_w,
	        "length_scale": length_scale,
	        "sid": speaker,
	    }
	    with torch.no_grad():
	        waveform = infer(text, **synthesis_options)

	    return (hps.data.sampling_rate, waveform)


	def text_splitter(text: str):
	    """Split ``text`` into trimmed, non-empty fragments at punctuation.

	    Args:
	        text: The text to split.

	    Returns:
	        List of fragments in their original order, with surrounding
	        whitespace removed and empty fragments dropped.
	    """
	    # NOTE: commas inside a character class are literal, so ASCII "," (and the
	    # trailing space) act as separators too, alongside the listed marks.
	    punctuation = r"[。,；,！,？,〜,\n,\r,\t,.,!,;,?,~, ]"
	    trimmed = (piece.strip() for piece in re.split(punctuation, text.strip()))
	    return [piece for piece in trimmed if piece]


	def concatenate_audios(audio_samples, sample_rate=44100):
	    """Join audio clips into one signal with half a second of silence between.

	    Args:
	        audio_samples: Non-empty sequence of 1-D audio arrays (or array-likes).
	        sample_rate: Samples per second; determines the length of the silence
	            gap and is returned unchanged.

	    Returns:
	        ``(sample_rate, final_audio)`` where ``final_audio`` is a NumPy array.

	    Raises:
	        ValueError: If ``audio_samples`` is empty.
	    """
	    if len(audio_samples) == 0:
	        # Previously this surfaced as an opaque "list index out of range".
	        raise ValueError("No audio samples to concatenate")

	    clips = [np.asarray(clip) for clip in audio_samples]
	    # Give the silence the clips' dtype so the result is not upcast to float64
	    # only when more than one clip is present.
	    half_second_silence = np.zeros(int(sample_rate / 2), dtype=clips[0].dtype)

	    # Collect all pieces first and concatenate once, instead of re-copying the
	    # growing result for every clip.
	    pieces = [clips[0]]
	    for clip in clips[1:]:
	        pieces.extend((half_second_silence, clip))
	    final_audio = np.concatenate(pieces)

	    print("Audio pieces concatenated!")
	    return (sample_rate, final_audio)


	def read_text(file_path: str):
	    """Return the UTF-8 decoded contents of ``file_path``.

	    Failures are reported on stdout rather than raised.

	    Args:
	        file_path: Path of the text file to read.

	    Returns:
	        The file contents as a string, or ``None`` if reading failed.
	    """
	    try:
	        with open(file_path, "r", encoding="utf-8") as handle:
	            return handle.read()
	    except FileNotFoundError:
	        print(f"File Not Found: {file_path}")
	    except IOError:
	        print(f"An error occurred reading the file: {file_path}")
	    except Exception as e:
	        print(f"An unknown error has occurred: {e}")

	    # Reached only after one of the handlers above has reported a failure.
	    return None


	def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
	    """Synthesize the contents of an uploaded text file, sentence by sentence.

	    Args:
	        text: Path of the uploaded text file (despite the name, this is a file
	            path, not the text itself).
	        speaker: Speaker name, forwarded to ``infer`` as ``sid``.
	        sdp_ratio: Forwarded to ``infer``.
	        noise_scale: Forwarded to ``infer``.
	        noise_scale_w: Forwarded to ``infer``.
	        length_scale: Forwarded to ``infer``.

	    Returns:
	        On success ``((sampling_rate, audio), content)`` where ``content`` is
	        the text read from the file. On failure ``(None, message)`` where
	        ``message`` describes what went wrong.
	    """
	    if not text:
	        # No file was supplied; say so instead of failing on open(None).
	        return None, "Please upload a TXT file first."

	    try:
	        content = read_text(text)
	        if content is None:
	            # read_text reports failures by returning None; without this guard
	            # the caller would see an unrelated attribute error.
	            return None, f"Failed to read the uploaded file: {text}"

	        sentences = text_splitter(content)
	        if not sentences:
	            return None, "The uploaded file contains no text to synthesize."

	        # infer() already runs under torch.no_grad(), so no extra guard here.
	        audios = [
	            infer(
	                sentence,
	                sdp_ratio=sdp_ratio,
	                noise_scale=noise_scale,
	                noise_scale_w=noise_scale_w,
	                length_scale=length_scale,
	                sid=speaker,
	            )
	            for sentence in tqdm(sentences, desc="TTS inferring...")
	        ]

	        return concatenate_audios(audios, hps.data.sampling_rate), content

	    except Exception as e:
	        # UI boundary: surface the error text in the output box.
	        return None, f"{e}"


	def infer_tab2(content, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
	    """Synthesize typed-in text, sentence by sentence.

	    Args:
	        content: The text to synthesize.
	        speaker: Speaker name, forwarded to ``infer`` as ``sid``.
	        sdp_ratio: Forwarded to ``infer``.
	        noise_scale: Forwarded to ``infer``.
	        noise_scale_w: Forwarded to ``infer``.
	        length_scale: Forwarded to ``infer``.

	    Returns:
	        ``(sampling_rate, audio)`` on success, or ``None`` if anything failed
	        (the error text is printed to stdout).
	    """
	    synthesis_options = {
	        "sdp_ratio": sdp_ratio,
	        "noise_scale": noise_scale,
	        "noise_scale_w": noise_scale_w,
	        "length_scale": length_scale,
	        "sid": speaker,
	    }
	    try:
	        clips = []
	        with torch.no_grad():
	            for sentence in tqdm(text_splitter(content), desc="TTS inferring..."):
	                clips.append(infer(sentence, **synthesis_options))

	        return concatenate_audios(clips, hps.data.sampling_rate)

	    except Exception as err:
	        print(f"{err}")
	        return None


	if __name__ == "__main__":
	    # Fetch the model repository snapshot; files are stored under cache_dir.
	    model_dir = snapshot_download("Genius-Society/hoyoTTS", cache_dir="./__pycache__")
	    if debug:
	        logger.info("Enable DEBUG-LEVEL log")
	        # logging.basicConfig() does nothing once the root logger has handlers,
	        # and the module-level basicConfig() call has already installed one,
	        # so raise the root logger's level directly instead.
	        logging.getLogger().setLevel(logging.DEBUG)

	    hps = utils.get_hparams_from_dir(model_dir)

	    # Device preference: CUDA first, then Apple MPS (macOS only), then CPU.
	    if torch.cuda.is_available():
	        device = "cuda:0"
	    elif sys.platform == "darwin" and torch.backends.mps.is_available():
	        device = "mps"
	    else:
	        device = "cpu"

	    net_g = SynthesizerTrn(
	        len(symbols),
	        hps.data.filter_length // 2 + 1,
	        hps.train.segment_size // hps.data.hop_length,
	        n_speakers=hps.data.n_speakers,
	        **hps.model,
	    ).to(device)
	    net_g.eval()
	    utils.load_checkpoint(f"{model_dir}/G_78000.pth", net_g, None, skip_optimizer=True)

	    speaker_ids = hps.data.spk2id
	    speakers = list(speaker_ids.keys())
	    random.shuffle(speakers)

	    def _voice_controls():
	        """Build the speaker dropdown and four sliders shared by both tabs.

	        The order matches the trailing parameters of infer_tab1/infer_tab2:
	        speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale.
	        Fresh components are created on every call so each tab owns its own.
	        """
	        return [
	            gr.Dropdown(choices=speakers, value="莱依拉", label="Role"),
	            gr.Slider(
	                minimum=0,
	                maximum=1,
	                value=0.2,
	                step=0.1,
	                label="Modulation of intonation",
	            ),  # SDP/DP Mix Ratio
	            gr.Slider(
	                minimum=0.1,
	                maximum=2,
	                value=0.6,
	                step=0.1,
	                label="Emotional adjustment",
	            ),
	            gr.Slider(
	                minimum=0.1,
	                maximum=2,
	                value=0.8,
	                step=0.1,
	                label="Phoneme length",
	            ),
	            gr.Slider(
	                minimum=0.1,
	                maximum=2,
	                value=1,
	                step=0.1,
	                label="Output duration",
	            ),
	        ]

	    with gr.Blocks() as app:
	        gr.Markdown(
	            """
	Welcome to the Space, which is based on the open source project <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a>, and moved to the bottom for an explanation of the principle. This Space must be used in accordance with local laws and regulations, prohibiting the use of it for any criminal activities."""
	        )

	        # Tab 1: synthesize text typed directly into a text area.
	        with gr.Tab("Input Mode"):
	            gr.Interface(
	                fn=infer_tab2,
	                inputs=[
	                    gr.TextArea(
	                        label="Please input the Simplified Chinese text",
	                        placeholder="The first inference takes time to download the model, so be patient.",
	                        show_copy_button=True,
	                    ),
	                    *_voice_controls(),
	                ],
	                outputs=gr.Audio(label="Output Audio", show_share_button=False),
	                flagging_mode="never",
	                concurrency_limit=4,
	            )

	        # Tab 2: synthesize the contents of an uploaded .txt file and echo the
	        # extracted text next to the audio.
	        with gr.Tab("Upload Mode"):
	            gr.Interface(
	                fn=infer_tab1,  # Use text_to_speech func
	                inputs=[
	                    gr.components.File(
	                        label="Please upload a simplified Chinese TXT",
	                        type="filepath",
	                        file_types=[".txt"],
	                    ),
	                    *_voice_controls(),
	                ],
	                outputs=[
	                    gr.Audio(label="Output Audio", show_share_button=False),
	                    gr.TextArea(
	                        label="Result of TXT extraction",
	                        show_copy_button=True,
	                    ),
	                ],
	                flagging_mode="never",
	                concurrency_limit=4,
	            )

	        # Embedded video player shown below the tabs.
	        gr.HTML(
	            """
	<iframe src="//player.bilibili.com/player.html?bvid=BV1hergYRENX&p=2&autoplay=0" scrolling="no" border="0" frameborder="no" framespacing="0" allowfullscreen="true" width="100%" style="aspect-ratio: 16 / 9;">
	</iframe>
	"""
	        )

	    app.launch()