Spaces:

nam194
/

text-to-speech

Running

App Files Files Community

text-to-speech / app.py

nam194

Update app.py

eb557de verified 7 months ago

raw

history blame

3.91 kB

	import os
	import time
	import gradio as gr
	import wave
	import numpy as np
	from io import BytesIO
	from huggingface_hub import login, hf_hub_download
	from piper import PiperVoice
	from vinorm import TTSnorm
	from vi_cleaner.vi_cleaner import ViCleaner
	# Authenticate with the Hugging Face Hub at import time. The token is read
	# from the "hf_token" environment variable; a missing variable raises
	# KeyError here, before any UI is built.
	login(token=os.environ["hf_token"])

	def normalize_vietnamese_text(text):
	    """Return *text* normalized for the Vietnamese TTS model.

	    The text is first run through ``TTSnorm``, then a fixed list of literal
	    substitutions tidies punctuation, removes quote characters and spells
	    out the acronym "AI", and finally ``ViCleaner`` cleans the result.
	    """
	    # Applied strictly in this order: every rule sees the output of the
	    # previous one (e.g. "AI" is rewritten before "A.I").
	    # NOTE(review): these are plain substring replacements, so "AI" inside a
	    # longer upper-case word would also be rewritten -- confirm that is
	    # acceptable for the TTSnorm output.
	    substitutions = (
	        ("..", "."),
	        ("!.", "!"),
	        ("?.", "?"),
	        (" .", "."),
	        (" ,", ","),
	        ('"', ""),
	        ("'", ""),
	        ("AI", "Ây Ai"),
	        ("A.I", "Ây Ai"),
	    )

	    normalized = TTSnorm(text, unknown=False, lower=False, rule=True)
	    for needle, replacement in substitutions:
	        normalized = normalized.replace(needle, replacement)

	    return ViCleaner(normalized).clean()


	def synthesize_speech(text, sentence_silence, length_scale, normalize_text=True):
	    """Synthesize *text* to speech with the Piper voice and report timing.

	    Args:
	        text: Text to read aloud.
	        sentence_silence: Silence inserted between sentences, forwarded to
	            ``PiperVoice.synthesize``.
	        length_scale: Speed factor, forwarded to ``PiperVoice.synthesize``.
	        normalize_text: When true, run ``normalize_vietnamese_text`` on
	            *text* before synthesis.

	    Returns:
	        A pair ``((sample_rate, audio_data), metric_text)`` where
	        ``audio_data`` is a 1-D ``int16`` NumPy array of mono PCM samples and
	        ``metric_text`` is a human-readable timing line.
	    """
	    # NOTE(review): the model files are resolved and the voice is loaded on
	    # every call; consider caching the loaded voice if latency matters.
	    model_path = hf_hub_download(
	        repo_id="nam194/piper-tts-w5n",
	        filename="tts_model.onnx",
	    )
	    config_path = hf_hub_download(
	        repo_id="nam194/piper-tts-w5n",
	        filename="tts_model.onnx.json",
	    )
	    if normalize_text:
	        text = normalize_vietnamese_text(text)

	    voice = PiperVoice.load(model_path, config_path)
	    sample_rate = voice.config.sample_rate
	    buffer = BytesIO()
	    start = time.time()
	    with wave.open(buffer, "wb") as wav_file:
	        wav_file.setframerate(sample_rate)
	        wav_file.setsampwidth(2)  # 16-bit samples
	        wav_file.setnchannels(1)  # mono
	        voice.synthesize(
	            text,
	            wav_file,
	            sentence_silence=sentence_silence,
	            length_scale=length_scale,
	        )

	    # The buffer now holds a complete WAV *file* (RIFF header + PCM data).
	    # Read only the frames back; feeding the raw file bytes to NumPy would
	    # turn the header into garbage samples at the start of the audio.
	    buffer.seek(0)
	    with wave.open(buffer, "rb") as wav_reader:
	        pcm_bytes = wav_reader.readframes(wav_reader.getnframes())
	    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16)

	    inference_time = time.time() - start
	    metric_text = f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
	    return (sample_rate, audio_data), metric_text


	# Gradio UI: a title row, then a row with the input controls on the left
	# and the synthesized audio plus timing metrics on the right.
	with gr.Blocks(analytics_enabled=False) as demo:
	    # Title row; the second, empty column only takes up layout space.
	    with gr.Row():
	        with gr.Column():
	            gr.Markdown(
	                """
	                # Vietnamese Text-to-speech Demo ✨
	                """
	            )
	        with gr.Column():
	            pass

	    with gr.Row():
	        # Left column: text prompt and synthesis controls.
	        with gr.Column():
	            input_text = gr.Textbox(
	                label="Text Prompt (Văn bản cần đọc)",
	                info="Mỗi câu nên gồm 10 từ trở lên.",
	                value="Xin chào, tôi là một mô hình chuyển đổi văn bản thành giọng nói tiếng Việt.",
	            )
	            sentence_silence = gr.Slider(
	                label="Khoảng lặng giữa câu (giây)",
	                minimum=0.0,
	                maximum=2.0,
	                step=0.05,
	                value=0.75,
	                info="Điều chỉnh độ dài khoảng lặng giữa các câu.",
	            )
	            length_scale = gr.Slider(
	                label="Tốc độ đọc",
	                minimum=0.5,
	                maximum=2.0,
	                step=0.05,
	                value=1.2,
	                info="Điều chỉnh tốc độ đọc (1.0 là tốc độ bình thường).",
	            )
	            normalize_text = gr.Checkbox(
	                label="Chuẩn hóa văn bản tiếng Việt",
	                info="Normalize Vietnamese text",
	                value=True,
	            )
	            submit_button = gr.Button(
	                "Đọc 🗣️🔥",
	                elem_id="send-btn",
	                visible=True,
	                variant="primary",
	            )

	        # Right column: playback widget and the timing line.
	        with gr.Column():
	            output_audio = gr.Audio(
	                label="Synthesised Audio",
	                autoplay=True,
	            )
	            out_text_gr = gr.Text(label="Metrics")

	    # Input order must match the positional parameters of synthesize_speech.
	    submit_button.click(
	        fn=synthesize_speech,
	        inputs=[input_text, sentence_silence, length_scale, normalize_text],
	        outputs=[output_audio, out_text_gr],
	    )

	# Start the app exactly once. When run as a script, launch() blocks until
	# the server stops, so a second launch() call would only execute after
	# shutdown and then start the app again (with a public share link).
	demo.launch()