Spaces:
Sleeping
Sleeping
File size: 5,786 Bytes
7e9eac8 d853661 b6e1649 7e9eac8 b6e1649 7e9eac8 b6e1649 7e9eac8 c7604ad 05c8449 c7604ad 7e9eac8 69f50e2 684c796 69f50e2 684c796 69f50e2 7e9eac8 bfd6986 7e9eac8 69f50e2 7e9eac8 69f50e2 7e9eac8 69f50e2 7e9eac8 69f50e2 7e9eac8 69f50e2 7e9eac8 ebb01fc 7e9eac8 684c796 7e9eac8 ebb01fc 7e9eac8 dd6a80b 6a19fc4 dd6a80b 50e49e1 dd6a80b 7e9eac8 dd6a80b 7e9eac8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
import logging
import os
import time
import uuid
import gradio as gr
import soundfile as sf
from model import get_pretrained_model, language_to_models
# Markdown heading shown at the top of the page.
title = "# Next-gen Kaldi: Text-to-speech (TTS)"

# Markdown text with project and download links, rendered via gr.Markdown.
description = """
This space shows how to convert text to speech with Next-gen Kaldi.
It is running on CPU within a docker container provided by Hugging Face.
See more information by visiting the following links:
- <https://github.com/k2-fsa/sherpa-onnx>
If you want to deploy it locally, please see
<https://k2-fsa.github.io/sherpa/>
If you want to use Android APKs, please see
<https://k2-fsa.github.io/sherpa/onnx/tts/apk.html>
If you want to use Android text-to-speech engine APKs, please see
<https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html>
If you want to download an all-in-one exe for Windows, please see
<https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models>
"""

# Style sheet handed to gr.Blocks; the class names match the markup produced
# by build_html_output.
css = """
.result {display:flex;flex-direction:column}
.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
.result_item_error {background-color:#ff7070;color:white;align-self:start}
"""

# Simplified examples for Portuguese only.
# Each row follows the order of the inputs given to `process`.
examples = [
    [
        "Portuguese",  # language
        "csukuangfj/vits-mms-por",  # model repo id
        "Eu desejo uma versão simplificada para português.",  # input text
        0,  # speaker id
        1.0,  # speed
    ],
]

# Use only Portuguese as a language choice
language_choices = ["Portuguese"]
def update_model_dropdown(language: str):
    """Build a model dropdown populated with the models of ``language``.

    Args:
        language: Key looked up in ``language_to_models``.

    Returns:
        An interactive ``gr.Dropdown`` whose choices are the models registered
        for ``language`` and whose selected value is the first of them.

    Raises:
        ValueError: If ``language`` is not a key of ``language_to_models``.
    """
    # Reject unknown languages up front so the happy path stays flat.
    if language not in language_to_models:
        raise ValueError(f"Unsupported language: {language}")

    models = language_to_models[language]
    return gr.Dropdown(choices=models, value=models[0], interactive=True)
def build_html_output(s: str, style: str = "result_item_success"):
    """Wrap ``s`` in the two nested result ``<div>`` elements.

    Args:
        s: Content placed inside the inner div. It is inserted verbatim, so
            any HTML it contains is kept as markup.
        style: CSS class added next to ``result_item`` on the inner div.

    Returns:
        The HTML snippet as a string, starting and ending with a newline.
    """
    # The empty first and last entries yield the leading and trailing newline.
    markup_lines = (
        "",
        "<div class='result'>",
        f"<div class='result_item {style}'>",
        f"{s}",
        "</div>",
        "</div>",
        "",
    )
    return "\n".join(markup_lines)
def process(language: str, repo_id: str, text: str, sid: str, speed: float):
    """Synthesize ``text`` with the selected model and save it as a WAV file.

    Args:
        language: Selected language. Not used by this function; it is part of
            the signature because the UI passes it as the first input.
        repo_id: Model identifier handed to ``get_pretrained_model``.
        text: Text to synthesize.
        sid: Speaker ID; converted with ``int`` before use.
        speed: Speed value handed to ``get_pretrained_model``.

    Returns:
        A tuple ``(filename, html)``: the name of the 16-bit PCM WAV file that
        was written, and an HTML snippet with duration, processing time and
        real-time factor (RTF).

    Raises:
        ValueError: If ``sid`` cannot be converted to ``int`` or if the model
            produced no samples.
    """
    logging.info("Input text: %s. sid: %s, speed: %s", text, sid, speed)
    sid = int(sid)
    tts = get_pretrained_model(repo_id, speed)

    # perf_counter() is monotonic; time.time() follows the wall clock, which
    # may be adjusted mid-run and would then distort the measured interval.
    start = time.perf_counter()
    audio = tts.generate(text, sid=sid)
    end = time.perf_counter()

    if len(audio.samples) == 0:
        raise ValueError(
            "Error in generating audios. Please read previous error messages."
        )

    duration = len(audio.samples) / audio.sample_rate
    elapsed_seconds = end - start
    # Real-time factor: processing time divided by the audio duration.
    rtf = elapsed_seconds / duration

    info = (
        "\n"
        f"Wave duration : {duration:.3f} s <br/>\n"
        f"Processing time: {elapsed_seconds:.3f} s <br/>\n"
        f"RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>\n"
    )
    logging.info(info)
    logging.info(
        "\nrepo_id: %s\ntext: %s\nsid: %s\nspeed: %s", repo_id, text, sid, speed
    )

    # A random UUID keeps output names of separate requests from colliding.
    filename = f"{uuid.uuid4()}.wav"
    sf.write(
        filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )
    return filename, build_html_output(info)
# Top-level Gradio container; `css` supplies the styles for the result panel.
demo = gr.Blocks(css=css)

# NOTE(review): the click handler and the footer are placed outside the tab
# item here — confirm this matches the intended page layout.
with demo:
    gr.Markdown(title)

    # Use Radio instead of Dropdown for language choice
    language_radio = gr.Radio(
        choices=language_choices,
        value=language_choices[0],
        label="Language",
    )

    # Initialize model_dropdown with Portuguese models
    portuguese_models = language_to_models["Portuguese"]
    model_dropdown = gr.Dropdown(
        choices=portuguese_models,
        value=portuguese_models[0],
        label="Select a model",
    )

    # No need to update model_dropdown for a single language
    with gr.Tabs():
        with gr.TabItem("Please input your text"):
            input_text = gr.Textbox(
                label="Input text",
                info="Your text",
                lines=3,
                placeholder="Please input your text here",
            )
            input_sid = gr.Textbox(
                label="Speaker ID",
                info="Speaker ID",
                lines=1,
                max_lines=1,
                value="0",
                placeholder="Speaker ID. Valid only for mult-speaker model",
            )
            input_speed = gr.Slider(
                minimum=0.1,
                maximum=10,
                value=1,
                step=0.1,
                label="Speed (larger->faster; smaller->slower)",
            )
            input_button = gr.Button("Submit")
            output_audio = gr.Audio(label="Output")
            output_info = gr.HTML(label="Info")

            # The examples and the submit button feed the same components to
            # `process`, in the order of its parameters, and show its two
            # return values in the same outputs.
            tts_inputs = [
                language_radio,
                model_dropdown,
                input_text,
                input_sid,
                input_speed,
            ]
            tts_outputs = [output_audio, output_info]

            gr.Examples(
                examples=examples,
                fn=process,
                inputs=tts_inputs,
                outputs=tts_outputs,
            )

        input_button.click(process, inputs=tts_inputs, outputs=tts_outputs)

    gr.Markdown(description)
def download_espeak_ng_data():
    """Download the espeak-ng data archive into ``/tmp`` and unpack it there.

    Runs ``wget`` followed by ``tar``, both with ``/tmp`` as the working
    directory. The operation is best-effort: a failing step is logged and the
    remaining steps are skipped, but no exception is raised.
    """
    import subprocess  # local import: only this function needs it

    url = (
        "https://github.com/k2-fsa/sherpa-onnx/releases/download/"
        "tts-models/espeak-ng-data.tar.bz2"
    )
    commands = (
        ["wget", "-qq", url],
        ["tar", "xf", "espeak-ng-data.tar.bz2"],
    )

    # Use a logger object rather than logging.warning(): the module-level
    # helpers call basicConfig() implicitly when no handler is installed,
    # which would turn a later explicit basicConfig(...) into a no-op.
    logger = logging.getLogger(__name__)

    for command in commands:
        try:
            status = subprocess.run(command, cwd="/tmp", check=False).returncode
        except OSError as ex:
            # Raised e.g. when the executable or the working directory is missing.
            logger.warning("Failed to run %s: %s", command[0], ex)
            return
        if status != 0:
            # Stop here so that tar is not run on a missing or partial archive.
            logger.warning(
                "%s exited with status %d; espeak-ng data may be unavailable",
                command[0],
                status,
            )
            return
if __name__ == "__main__":
    # Configure logging first so that messages emitted by the remaining
    # startup steps already use this format.
    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    # Fetch the espeak-ng data before the UI starts serving requests.
    download_espeak_ng_data()

    demo.launch()
|