Spaces:

nam194
/

text-to-speech

Running

App Files Files Community

text-to-speech / app.py

nam194

Update app.py

339e016 verified 6 months ago

raw

history blame

4.19 kB

	import os, time, pytz, wave
	import numpy as np
	import gradio as gr
	from io import BytesIO
	from datetime import datetime
	from huggingface_hub import login, hf_hub_download
	from piper import PiperVoice
	# from vinorm import TTSnorm
	from vi_cleaner.vi_cleaner import ViCleaner
	# Time zone used when printing the timestamp of each synthesis request.
	TIME_ZONE = "Asia/Ho_Chi_Minh"

	# Authenticate with the Hugging Face Hub at import time. Indexing
	# os.environ raises KeyError when "hf_token" is not set.
	login(token=os.environ["hf_token"])

	def normalize_vietnamese_text(text):
	    """Clean ``text`` with ``ViCleaner`` and break it into lines per sentence.

	    The text is passed through ``ViCleaner(text).clean()``; afterwards every
	    period that is followed by a space is rewritten as a period followed by a
	    newline.

	    Args:
	        text: Raw input text.

	    Returns:
	        The cleaned text with sentence-ending ". " replaced by a period and a
	        line break.
	    """
	    # NOTE: a TTSnorm-based normalization (with extra punctuation cleanup and
	    # an "AI" pronunciation substitution) was present here only as
	    # commented-out code; ViCleaner is the active implementation.
	    cleaned = ViCleaner(text).clean()
	    return cleaned.replace(". ", ".\n")


	def synthesize_speech(text, sentence_silence, length_scale, normalize_text=True):
	    """Synthesize speech for ``text`` with the Piper voice model.

	    Args:
	        text: Text to read aloud.
	        sentence_silence: Forwarded to ``PiperVoice.synthesize`` as
	            ``sentence_silence`` (the UI labels it as the pause between
	            sentences, in seconds).
	        length_scale: Forwarded to ``PiperVoice.synthesize`` as
	            ``length_scale`` (the UI labels it as the reading speed).
	        normalize_text: When true, ``text`` is first passed through
	            ``normalize_vietnamese_text``.

	    Returns:
	        A pair ``((sample_rate, samples), metric_text)``: ``sample_rate`` is
	        taken from the voice config, ``samples`` is a NumPy ``int16`` array
	        holding only the PCM frames of the generated WAV, and ``metric_text``
	        reports the elapsed synthesis time in milliseconds.

	    Raises:
	        gr.Error: If synthesis produced no audio frames.
	    """
	    # NOTE(review): the model files are resolved and the voice is loaded on
	    # every call; consider caching the loaded voice if latency matters.
	    repo_id = "nam194/piper-tts-w5n"
	    model_path = hf_hub_download(repo_id=repo_id, filename="tts_model.onnx")
	    config_path = hf_hub_download(repo_id=repo_id, filename="tts_model.onnx.json")

	    if normalize_text:
	        text = normalize_vietnamese_text(text)

	    # Log the request to stdout.
	    print("\n"+"="*80+"\n")
	    print("Text synthesized:", text)
	    print("Time:", datetime.now(pytz.timezone(TIME_ZONE)).strftime("%m/%d/%Y, %H:%M:%S"))
	    print("\n"+"="*80+"\n")

	    voice = PiperVoice.load(model_path, config_path)
	    buffer = BytesIO()
	    start = time.time()
	    with wave.open(buffer, "wb") as wav_file:
	        wav_file.setframerate(voice.config.sample_rate)
	        wav_file.setsampwidth(2)  # 2 bytes per sample, decoded as int16 below
	        wav_file.setnchannels(1)
	        voice.synthesize(
	            text,
	            wav_file,
	            sentence_silence=sentence_silence,
	            length_scale=length_scale,
	        )

	    # Re-open the in-memory WAV for reading so that only the PCM frames are
	    # decoded. Decoding the raw buffer directly would also turn the WAV
	    # header bytes into bogus int16 "samples" at the start of the audio.
	    buffer.seek(0)
	    with wave.open(buffer, "rb") as wav_reader:
	        pcm_bytes = wav_reader.readframes(wav_reader.getnframes())
	    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16)
	    inference_time = time.time() - start

	    # Nothing to play back: report it to the UI instead of returning an
	    # empty waveform.
	    if audio_data.size == 0:
	        raise gr.Error("No audio was generated for the given text.")

	    metric_text = f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
	    return (voice.config.sample_rate, audio_data), metric_text


	with gr.Blocks(analytics_enabled=False) as demo:
	    # Header row: the title goes in the first of two columns; the second
	    # column is left empty.
	    with gr.Row():
	        with gr.Column():
	            # The Markdown literal is kept exactly as it appears in the file.
	            gr.Markdown(
	                """
	# Vietnamese Text-to-speech Demo ✨
	"""
	            )
	        with gr.Column():
	            pass

	    with gr.Row():
	        # First column: text prompt, synthesis settings and the submit button.
	        with gr.Column():
	            input_text = gr.Textbox(
	                value="Xin chào, tôi là một mô hình chuyển đổi văn bản thành giọng nói tiếng Việt.",
	                label="Text Prompt (Văn bản cần đọc)",
	                info="Mỗi câu nên gồm 10 từ trở lên.",
	            )
	            sentence_silence = gr.Slider(
	                minimum=0.0,
	                maximum=2.0,
	                step=0.05,
	                value=0.75,
	                label="Khoảng lặng giữa câu (giây)",
	                info="Điều chỉnh độ dài khoảng lặng giữa các câu.",
	            )
	            length_scale = gr.Slider(
	                minimum=0.5,
	                maximum=2.0,
	                step=0.05,
	                value=1.3,
	                label="Tốc độ đọc",
	                info="Điều chỉnh tốc độ đọc (1.0 là tốc độ bình thường).",
	            )
	            normalize_text = gr.Checkbox(
	                value=True,
	                label="Chuẩn hóa văn bản tiếng Việt",
	                info="Normalize Vietnamese text",
	            )
	            submit_button = gr.Button(
	                "Đọc 🗣️🔥",
	                variant="primary",
	                visible=True,
	                elem_id="send-btn",
	            )

	        # Second column: the synthesized audio and the timing metric.
	        with gr.Column():
	            output_audio = gr.Audio(autoplay=True, label="Synthesised Audio")
	            out_text_gr = gr.Text(label="Metrics")

	    # Wire the button: the four controls are passed to synthesize_speech in
	    # its parameter order, and its two return values fill the outputs.
	    submit_button.click(
	        fn=synthesize_speech,
	        inputs=[input_text, sentence_silence, length_scale, normalize_text],
	        outputs=[output_audio, out_text_gr],
	    )

	# Start the app exactly once. A second
	# ``demo.launch(debug=True, show_api=True, share=True)`` call used to follow
	# this one; it could only run after the first server had stopped, at which
	# point it would start a new server with a public share link.
	demo.launch()