# NOTE(review): removed git-blame viewer residue (byte count, commit-hash gutter,
# line-number gutter) that was accidentally captured with the source.
import torch
torch.manual_seed(160923)
import gradio as gr
import torch.cuda
from huggingface_hub import hf_hub_download
from InferenceInterfaces.ControllableInterface import ControllableInterface
from Utility.utils import float2pcm
from Utility.utils import load_json_from_path
class TTSWebUI:
    """Gradio web front-end for controllable multilingual text-to-speech.

    Wires a ``ControllableInterface`` (the actual TTS engine) into a
    ``gr.Interface`` and launches the server immediately — note that the
    constructor BLOCKS on ``launch()``, so instantiating this class runs the app.
    """

    def __init__(self,
                 gpu_id="cpu",
                 title="Controllable Text-to-Speech for over 7000 Languages",
                 article="This is running using CPU. <br>",
                 tts_model_path=None,
                 vocoder_model_path=None,
                 embedding_gan_path=None,
                 available_artificial_voices=32  # be careful with this, if you want too many, it might lead to an endless loop
                 ):
        """Build the UI and start serving.

        Args:
            gpu_id: device identifier passed to the inference backend ("cpu" or a CUDA id).
            title: page title shown by Gradio.
            article: HTML description shown under the interface.
            tts_model_path: optional override for the acoustic model checkpoint.
            vocoder_model_path: optional override for the vocoder checkpoint.
            embedding_gan_path: optional override for the speaker-embedding GAN.
            available_artificial_voices: upper bound of the voice-seed slider.
        """
        # Fetch the ISO-639 code -> language-name mapping used to populate the dropdown.
        path_to_iso_list = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
        iso_to_name = load_json_from_path(path_to_iso_list)
        # Dropdown entries look like "English (eng)"; read() parses the code back out.
        text_selection = [f"{iso_to_name[iso_code]} ({iso_code})" for iso_code in iso_to_name]
        self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
                                                     available_artificial_voices=available_artificial_voices,
                                                     tts_model_path=tts_model_path,
                                                     vocoder_model_path=vocoder_model_path,
                                                     embedding_gan_path=embedding_gan_path)
        # Input order here must match the parameter order of self.read.
        self.iface = gr.Interface(fn=self.read,
                                  inputs=[gr.Textbox(lines=2,
                                                     placeholder="write what you want the synthesis to read here...",
                                                     value="What I cannot create, I do not understand.",
                                                     label="Text input"),
                                          gr.Dropdown(text_selection,
                                                      type="value",
                                                      value='English (eng)',
                                                      label="Select the Language of the Text (type on your keyboard to find it quickly)"),
                                          gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
                                          gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Faster - Slower"),
                                          gr.Slider(minimum=0, maximum=available_artificial_voices, step=1, value=16, label="Random Seed for the artificial Voice"),
                                          gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Gender of artificial Voice"),
                                          gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
                                          ],
                                  outputs=[gr.Audio(type="numpy", label="Speech"),
                                           gr.Image(label="Visualization")],
                                  title=title,
                                  allow_flagging="never",
                                  description=article)
        self.iface.launch()  # blocking: serves the web UI until interrupted

    def read(self,
             prompt,
             language,
             prosody_creativity,
             duration_scaling_factor,
             voice_seed,
             emb1,
             reference_audio
             ):
        """Synthesize speech for the UI inputs.

        Returns:
            ``((sample_rate, int16_pcm_audio), figure)`` — the tuple shape the
            two Gradio outputs (Audio, Image) expect.
        """
        # The dropdown value is e.g. "English (eng)"; extract the bare ISO code once
        # instead of repeating the parse for every positional argument below.
        iso_code = language.split(" ")[-1].split("(")[1].split(")")[0]
        sr, wav, fig = self.controllable_ui.read(prompt,
                                                 reference_audio,
                                                 iso_code,
                                                 iso_code,  # same code doubles as the accent selection
                                                 voice_seed,
                                                 prosody_creativity,
                                                 duration_scaling_factor,
                                                 1.,    # pause_duration_scaling_factor — presumably; TODO confirm against ControllableInterface.read
                                                 1.0,   # pitch variance scale (UI slider removed)
                                                 1.0,   # energy variance scale (UI slider removed)
                                                 emb1,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 -24.)  # NOTE(review): meaning of the trailing constants not visible here — verify against backend signature
        return (sr, float2pcm(wav)), fig
if __name__ == '__main__':
    # Prefer the GPU when one is visible to torch; otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    TTSWebUI(gpu_id=device)