Spaces:

nam194
/

text-to-speech

Running

App Files Files Community

nam194 committed on Dec 28, 2024

Commit

ae4d0e1

verified ·

1 Parent(s): 78d6dbe

Create app.py

Browse files

Files changed (1) hide show

app.py +117 -0

app.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import os
+import time
+import gradio as gr
+import wave
+import numpy as np
+from io import BytesIO
+from huggingface_hub import login, hf_hub_download
+from piper import PiperVoice
+from vinorm import TTSnorm
+login(os.environ["hf_token"])
+def normalize_vietnamese_text(text):
+    text = (
+        TTSnorm(text, unknown=False, lower=False, rule=True)
+        .replace("..", ".")
+        .replace("!.", "!")
+        .replace("?.", "?")
+        .replace(" .", ".")
+        .replace(" ,", ",")
+        .replace('"', "")
+        .replace("'", "")
+        .replace("AI", "Ây Ai")
+        .replace("A.I", "Ây Ai")
+    )
+    return text
+def synthesize_speech(text, sentence_silence, length_scale, normalize_text=True):
+    model_path = hf_hub_download(
+        repo_id="nam194/piper-tts-w5n",
+        filename="tts_model.onnx"
+    )
+    config_path = hf_hub_download(
+        repo_id="nam194/piper-tts-w5n",
+        filename="tts_model.onnx.json"
+    )
+    if normalize_text:
+        text = normalize_vietnamese_text(text)
+    voice = PiperVoice.load(model_path, config_path)
+    buffer = BytesIO()
+    start = time.time()
+    with wave.open(buffer, "wb") as wav_file:
+        wav_file.setframerate(voice.config.sample_rate)
+        wav_file.setsampwidth(2)
+        wav_file.setnchannels(1)
+        voice.synthesize(
+            text, wav_file, sentence_silence=sentence_silence, length_scale=length_scale
+        )
+    buffer.seek(0)
+    audio_data = np.frombuffer(buffer.read(), dtype=np.int16)
+    inference_time = time.time() - start
+    return (voice.config.sample_rate, audio_data), "{}s".format(inference_time)
+with gr.Blocks(analytics_enabled=False) as demo:
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(
+                """
+                # Vietnamese Text-to-speech Demo ✨
+                """
+            )
+        with gr.Column():
+            pass
+    with gr.Row():
+        with gr.Column():
+            input_text_gr = gr.Textbox(
+                label="Text Prompt (Văn bản cần đọc)",
+                info="Mỗi câu nên từ 10 từ trở lên.",
+                value="Xin chào, tôi là một mô hình chuyển đổi văn bản thành giọng nói tiếng Việt.",
+            )
+            sentence_silence = gr.Slider(
+                label="Khoảng lặng giữa câu (giây)",
+                minimum=0.0,
+                maximum=2.0,
+                step=0.05,
+                value=0.1,
+                info="Điều chỉnh độ dài khoảng lặng giữa các câu."
+            )
+            length_scale = gr.Slider(
+                label="Tốc độ đọc",
+                minimum=0.5,
+                maximum=2.0,
+                step=0.05,
+                value=1.0,
+                info="Điều chỉnh tốc độ đọc (1.0 là tốc độ bình thường)."
+            )
+            normalize_text = gr.Checkbox(
+                label="Chuẩn hóa văn bản tiếng Việt",
+                info="Normalize Vietnamese text",
+                value=True,
+            )
+            submit_button = gr.Button(
+                "Đọc 🗣️🔥",
+                elem_id="send-btn",
+                visible=True,
+                variant="primary",
+            )
+        with gr.Column():
+            output_audio = gr.Audio(
+                label="Synthesised Audio",
+                autoplay=True,
+            )
+            out_text_gr = gr.Text(label="Metrics")
+    submit_button.click(
+        synthesize_speech,
+        inputs=[input_text,
+                sentence_silence,
+                length_scale,
+                normalize_text],
+        outputs=[output_audio, out_text_gr],
+    )
+demo.launch()