|
""" |
|
Copyright 2022 Balacoon |
|
|
|
TTS interactive demo |
|
""" |
|
|
|
import os |
|
import glob |
|
import logging |
|
from typing import cast |
|
|
|
import gradio as gr |
|
from balacoon_tts import TTS |
|
from huggingface_hub import hf_hub_download, list_repo_files |
|
|
|
|
|
# TTS engine currently held in memory and the addon path it was built from.
# Both remain None until one of the UI callbacks loads a model.
tts, cur_model_path = None, None

# Cache filled by the UI: model file name -> speakers reported by that model.
model_to_speakers = {}

# Local directory that receives the files of the model repository.
model_repo_dir = "data"

# Fetch every file listed in the "balacoon/tts" repository into
# `model_repo_dir`. NOTE: this runs at import time, before `main` is called.
for name in list_repo_files("balacoon/tts"):
    hf_hub_download("balacoon/tts", name, local_dir=model_repo_dir)
|
|
|
|
|
def _ensure_tts_loaded(model_path: str) -> "TTS":
    """
    Return the global TTS engine, creating it from `model_path`
    when that addon is not the one currently loaded.
    """
    global tts, cur_model_path
    if tts is None or model_path != cur_model_path:
        tts = TTS(model_path)
        cur_model_path = model_path
    return tts


def main():
    """
    Builds the interactive demo and launches it.

    The page consists of a text box, a model dropdown (filled with the
    `*_cpu.addon` files found in `model_repo_dir`), a speaker dropdown
    (filled once a model is picked), a "Generate" button and an audio
    widget that receives the synthesized waveform.
    """
    logging.basicConfig(level=logging.INFO)

    # longer input is cut to this many characters before synthesis
    max_text_len = 1024

    with gr.Blocks() as demo:
        gr.Markdown(
            """
            <h1 align="center">Balacoon🦝 Text-to-Speech</h1>

            1. Write an utterance to generate,
            2. Select the model to synthesize with
            3. Select speaker
            4. Hit "Generate" and listen to the result!

            You can learn more about models available
            [here](https://huggingface.co/balacoon/tts).
            Visit [Balacoon website](https://balacoon.com/) for more info.
            """
        )
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")

        with gr.Row():
            with gr.Column(variant="panel"):
                # os.listdir gives no ordering guarantee; sort so that the
                # dropdown lists models in a stable order between runs
                model_files = sorted(
                    x for x in os.listdir(model_repo_dir) if x.endswith("_cpu.addon")
                )
                model_name = gr.Dropdown(
                    label="Model",
                    choices=model_files,
                )
            with gr.Column(variant="panel"):
                speaker = gr.Dropdown(label="Speaker", choices=[])

        def set_model(model_name_str: str):
            """
            gets value from `model_name` and returns an update for the
            `speaker` dropdown. either uses cached list of speakers for
            the given model name or loads the addon and asks it for
            the speakers. the last speaker of the list is pre-selected.
            """
            if not model_name_str:
                # no model selected: there is no path to load from,
                # so just present an empty speaker list
                return gr.Dropdown.update(choices=[], value=None, visible=True)

            speakers = model_to_speakers.get(model_name_str)
            if speakers is None:
                model_path = os.path.join(model_repo_dir, model_name_str)
                speakers = _ensure_tts_loaded(model_path).get_speakers()
                model_to_speakers[model_name_str] = speakers

            # guard against a model that reports no speakers at all
            value = speakers[-1] if speakers else None
            return gr.Dropdown.update(
                choices=speakers, value=value, visible=True
            )

        model_name.change(set_model, inputs=model_name, outputs=speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(text_str: str, model_name_str: str, speaker_str: str):
            """
            gets utterance to synthesize from `text` Textbox, model file
            name from `model_name` and speaker name from `speaker`
            dropdown list. returns None (nothing is synthesized) when any
            of the three is empty. otherwise makes sure the requested
            model is the one loaded, synthesizes at most `max_text_len`
            characters of the text and updates `audio` with the result.

            NOTE(review): an empty speaker is rejected here; if
            single-speaker models are expected to work with an empty
            speaker name, this guard needs to be relaxed - confirm.
            """
            if not (text_str and model_name_str and speaker_str):
                logging.info("text, model name or speaker are not provided")
                return None

            # the speaker list may have come from the cache while another
            # model is still loaded, so re-check the active model here
            engine = _ensure_tts_loaded(os.path.join(model_repo_dir, model_name_str))

            # slicing is a no-op for text that is already short enough
            samples = engine.synthesize(text_str[:max_text_len], speaker_str)
            return gr.Audio.update(value=(engine.get_sampling_rate(), samples))

        generate.click(synthesize_audio, inputs=[text, model_name, speaker], outputs=audio)

    demo.queue(concurrency_count=1).launch()
|
|
|
|
|
# Start the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|