Spaces:

nam194
/

text-to-speech

Running

App Files Files Community

text-to-speech / app.py

nam194

Update app.py

088311b verified about 2 months ago

raw

history blame contribute delete

3.82 kB

	import os, time, pytz, wave
	import numpy as np
	import gradio as gr
	from io import BytesIO
	from datetime import datetime
	from huggingface_hub import login, hf_hub_download
	from piper import PiperVoice
	# from vinorm import TTSnorm
	from vi_cleaner.vi_cleaner import ViCleaner
	# Authenticate with the Hugging Face Hub using the token stored in the
	# "hf_token" environment variable. A missing variable raises KeyError here,
	# i.e. at import time, before any UI is built.
	login(token=os.environ["hf_token"])

	# IANA time-zone name used to timestamp the per-request log output.
	TIME_ZONE = "Asia/Ho_Chi_Minh"

	def normalize_vietnamese_text(text):
	    """Clean *text* with ViCleaner and place each sentence on its own line.

	    The text is first passed through ``ViCleaner(text).clean()``; afterwards
	    every period that is followed by a space is rewritten as a period
	    followed by a newline.

	    Returns the resulting string.
	    """
	    cleaned = ViCleaner(text).clean()
	    return cleaned.replace(". ", ".\n")


	# Lazily created PiperVoice shared by every request, so the ONNX model is
	# resolved on the Hub and loaded once instead of on every button click.
	_VOICE = None


	def _get_voice():
	    """Return the shared PiperVoice, downloading and loading it on first use."""
	    global _VOICE
	    if _VOICE is None:
	        model_path = hf_hub_download(
	            repo_id="nam194/piper-tts",
	            filename="tts_model.onnx",
	        )
	        config_path = hf_hub_download(
	            repo_id="nam194/piper-tts",
	            filename="tts_model.onnx.json",
	        )
	        _VOICE = PiperVoice.load(model_path, config_path)
	    return _VOICE


	def _is_blank(text):
	    """Return True when *text* is None, empty, or whitespace only."""
	    return not text or not text.strip()


	def synthesize_speech(text, sentence_silence, length_scale, normalize_text=True):
	    """Synthesize *text* to speech and report how long generation took.

	    Args:
	        text: The text to read aloud.
	        sentence_silence: Forwarded to ``PiperVoice.synthesize`` as
	            ``sentence_silence``.
	        length_scale: Forwarded to ``PiperVoice.synthesize`` as
	            ``length_scale``.
	        normalize_text: When true, run ``normalize_vietnamese_text`` on the
	            text before synthesis.

	    Returns:
	        A pair ``(audio, metric_text)``. ``audio`` is
	        ``(sample_rate, samples)`` where ``samples`` is a 1-D ``int16``
	        NumPy array of mono PCM data, or ``None`` when there is no text to
	        synthesize. ``metric_text`` is a human-readable status line.
	    """
	    empty_result = (None, "No text to synthesize.\n")

	    # Bail out before touching the model (or the normalizer) on blank input.
	    if _is_blank(text):
	        return empty_result
	    if normalize_text:
	        text = normalize_vietnamese_text(text)
	        # Normalization may strip everything (e.g. input of only symbols).
	        if _is_blank(text):
	            return empty_result

	    print("\n"+"="*80+"\n")
	    print("Text synthesized:", text)
	    print("Time:", datetime.now(pytz.timezone(TIME_ZONE)).strftime("%m/%d/%Y, %H:%M:%S"))
	    print("\n"+"="*80+"\n")

	    voice = _get_voice()
	    buffer = BytesIO()
	    start = time.time()
	    with wave.open(buffer, "wb") as wav_file:
	        wav_file.setframerate(voice.config.sample_rate)
	        wav_file.setsampwidth(2)  # 16-bit samples
	        wav_file.setnchannels(1)  # mono
	        voice.synthesize(
	            text, wav_file, sentence_silence=sentence_silence, length_scale=length_scale
	        )

	    # Re-open the in-memory WAV for reading and take only the PCM frames.
	    # Decoding the raw buffer directly would turn the RIFF/WAVE header bytes
	    # into bogus samples at the start of the audio.
	    buffer.seek(0)
	    with wave.open(buffer, "rb") as wav_reader:
	        pcm_bytes = wav_reader.readframes(wav_reader.getnframes())
	    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16)

	    inference_time = time.time() - start
	    metric_text = f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
	    return (voice.config.sample_rate, audio_data), metric_text


	# Build the Gradio UI: a title row, then a row with the input controls on
	# the left and the synthesized audio plus timing metrics on the right.
	with gr.Blocks(analytics_enabled=False) as demo:
	    # Header row; the second column is intentionally left empty.
	    with gr.Row():
	        with gr.Column():
	            gr.Markdown(
	                """
	                # Vietnamese Text-to-speech Demo ✨
	                """
	            )
	        with gr.Column():
	            pass

	    with gr.Row():
	        # Left column: text prompt and synthesis settings.
	        with gr.Column():
	            input_text = gr.Textbox(
	                value="Xin chào, tôi là một mô hình chuyển đổi văn bản thành giọng nói tiếng Việt.",
	                label="Text Prompt (Văn bản cần đọc)",
	                info="Mỗi câu nên gồm 10 từ trở lên.",
	            )
	            sentence_silence = gr.Slider(
	                minimum=0.0,
	                maximum=2.0,
	                value=0.75,
	                step=0.05,
	                label="Khoảng lặng giữa câu (giây)",
	                info="Điều chỉnh độ dài khoảng lặng giữa các câu.",
	            )
	            length_scale = gr.Slider(
	                minimum=0.5,
	                maximum=2.0,
	                value=1.3,
	                step=0.05,
	                label="Tốc độ đọc",
	                info="Điều chỉnh tốc độ đọc (1.0 là tốc độ bình thường).",
	            )
	            normalize_text = gr.Checkbox(
	                value=True,
	                label="Chuẩn hóa văn bản tiếng Việt",
	                info="Normalize Vietnamese text",
	            )
	            submit_button = gr.Button(
	                "Đọc 🗣️🔥",
	                variant="primary",
	                visible=True,
	                elem_id="send-btn",
	            )

	        # Right column: generated audio and the timing message.
	        with gr.Column():
	            output_audio = gr.Audio(
	                autoplay=True,
	                label="Synthesised Audio",
	            )
	            out_text_gr = gr.Text(label="Metrics")

	    # The component order here must match synthesize_speech's parameters
	    # (text, sentence_silence, length_scale, normalize_text) and its
	    # two-element return value (audio, metrics text).
	    submit_button.click(
	        fn=synthesize_speech,
	        inputs=[input_text, sentence_silence, length_scale, normalize_text],
	        outputs=[output_audio, out_text_gr],
	    )

	# Start the app at import time (this file is run as the Space's entry script).
	demo.launch(debug=True, show_api=True, share=True)