Spaces:

nam194
/

text-to-speech

Running

File size: 3,911 Bytes

import os
import time
import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import login, hf_hub_download
from piper import PiperVoice
from vinorm import TTSnorm
from vi_cleaner.vi_cleaner import ViCleaner
# Authenticate with the Hugging Face Hub before any model files are fetched.
# The token is read from the "hf_token" environment variable; a missing
# variable raises KeyError at import time.
login(token=os.environ["hf_token"])

def normalize_vietnamese_text(text):
    """Normalize raw Vietnamese text before it is sent to the TTS voice.

    The text is first passed through ``TTSnorm``, then a fixed list of literal
    substring substitutions is applied in order, and finally the result is
    run through ``ViCleaner``.

    Args:
        text: The raw input text.

    Returns:
        The value returned by ``ViCleaner(...).clean()`` for the tidied text.
    """
    # Applied top to bottom; the order matters because earlier substitutions
    # can create or remove matches for later ones.
    # NOTE: these are plain substring replacements, so "AI" is also rewritten
    # when it occurs inside a longer word.
    substitutions = (
        ("..", "."),
        ("!.", "!"),
        ("?.", "?"),
        (" .", "."),
        (" ,", ","),
        ('"', ""),
        ("'", ""),
        ("AI", "Ây Ai"),
        ("A.I", "Ây Ai"),
    )

    normalized = TTSnorm(text, unknown=False, lower=False, rule=True)
    for old, new in substitutions:
        normalized = normalized.replace(old, new)

    return ViCleaner(normalized).clean()


_VOICE_REPO_ID = "nam194/piper-tts-w5n"
# Process-wide cache so the ONNX model is downloaded and loaded only once
# instead of on every button click.
_VOICE_CACHE = {}


def _get_voice():
    """Return the Piper voice, fetching and loading it on first use only."""
    voice = _VOICE_CACHE.get("voice")
    if voice is None:
        model_path = hf_hub_download(
            repo_id=_VOICE_REPO_ID,
            filename="tts_model.onnx",
        )
        config_path = hf_hub_download(
            repo_id=_VOICE_REPO_ID,
            filename="tts_model.onnx.json",
        )
        voice = PiperVoice.load(model_path, config_path)
        _VOICE_CACHE["voice"] = voice
    return voice


def synthesize_speech(text, sentence_silence, length_scale, normalize_text=True):
    """Synthesize speech for ``text`` and report how long generation took.

    Args:
        text: The text to read aloud.
        sentence_silence: Forwarded to ``voice.synthesize`` as
            ``sentence_silence``.
        length_scale: Forwarded to ``voice.synthesize`` as ``length_scale``.
        normalize_text: When true, ``text`` is first passed through
            ``normalize_vietnamese_text``.

    Returns:
        A pair ``((sample_rate, samples), metric_text)`` where ``samples`` is
        a 1-D ``int16`` NumPy array of the generated PCM audio and
        ``metric_text`` is a one-line timing message.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    if normalize_text:
        text = normalize_vietnamese_text(text)

    voice = _get_voice()
    buffer = BytesIO()
    # Timing starts after the voice is available, so it measures generation
    # only and not model download/loading.
    start = time.perf_counter()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)
        wav_file.setnchannels(1)
        voice.synthesize(
            text, wav_file, sentence_silence=sentence_silence, length_scale=length_scale
        )

    # Read the frames back through the wave reader so that only PCM samples
    # are returned. Decoding the raw buffer would also turn the WAV header
    # bytes into bogus samples at the start of the audio.
    buffer.seek(0)
    with wave.open(buffer, "rb") as wav_reader:
        pcm_bytes = wav_reader.readframes(wav_reader.getnframes())
    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16)

    inference_time = time.perf_counter() - start
    metric_text = f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
    return (voice.config.sample_rate, audio_data), metric_text


# Build the demo page: a title row, followed by a row with the input controls
# in the first column and the synthesis results in the second column.
with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                # Vietnamese Text-to-speech Demo ✨
                """
            )
        # Second column of the title row; it contains no components.
        with gr.Column():
            pass

    with gr.Row():
        with gr.Column():
            # Text to be read. The Vietnamese hint says each sentence should
            # contain ten words or more.
            input_text = gr.Textbox(
                label="Text Prompt (Văn bản cần đọc)",
                info="Mỗi câu nên gồm 10 từ trở lên.",
                value="Xin chào, tôi là một mô hình chuyển đổi văn bản thành giọng nói tiếng Việt.",
            )
            # Labelled "silence between sentences (seconds)"; passed to
            # synthesize_speech as sentence_silence.
            sentence_silence = gr.Slider(
                label="Khoảng lặng giữa câu (giây)",
                minimum=0.0,
                maximum=2.0,
                step=0.05,
                value=0.75,
                info="Điều chỉnh độ dài khoảng lặng giữa các câu."
            )
            # Labelled "reading speed" (hint: 1.0 is normal speed); passed to
            # synthesize_speech as length_scale.
            # NOTE(review): in Piper a larger length_scale presumably means
            # slower speech, so the "speed" wording may be inverted - confirm.
            length_scale = gr.Slider(
                label="Tốc độ đọc",
                minimum=0.5,
                maximum=2.0,
                step=0.05,
                value=1.2,
                info="Điều chỉnh tốc độ đọc (1.0 là tốc độ bình thường)."
            )
            # Toggles the normalize_text argument of synthesize_speech.
            normalize_text = gr.Checkbox(
                label="Chuẩn hóa văn bản tiếng Việt",
                info="Normalize Vietnamese text",
                value=True,
            )
            # Button whose click triggers synthesis (wired up below).
            submit_button = gr.Button(
                "Đọc 🗣️🔥",
                elem_id="send-btn",
                visible=True,
                variant="primary",
            )
        with gr.Column():
            # Receives the (sample_rate, samples) pair returned by
            # synthesize_speech; autoplay is enabled.
            output_audio = gr.Audio(
                label="Synthesised Audio",
                autoplay=True,
            )
            # Receives the timing message returned by synthesize_speech.
            out_text_gr = gr.Text(label="Metrics")

    # The four inputs are passed positionally in the order of the
    # synthesize_speech signature; its two return values fill the audio
    # player and the metrics box respectively.
    submit_button.click(
        synthesize_speech,
        inputs=[input_text, 
                sentence_silence, 
                length_scale, 
                normalize_text],
        outputs=[output_audio, out_text_gr],
    )

# Start the server exactly once. The previous second call,
# launch(debug=True, show_api=True, share=True), could only run after the
# first (blocking) server had been stopped, at which point it would start a
# new server with a public share link. Keep the call that is effective today.
demo.launch()