INTROTXT = """# StyleTTS 2
Kudos to mrfakename for the base Gradio code borrowed here.
This Space is set up for Japanese.
Text-guided inference may or may not work, and inference is limited to a maximum of 512 tokens.
"""
import gradio as gr
import styletts2importable
import ljspeechimportable
import torch
import os
from txtsplit import txtsplit
import numpy as np
import pickle
# Hub themes such as "NoCrypt/miku" would normally be loaded with
# gr.Theme.from_hub(); gr.themes.Base() itself only accepts styling arguments.
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)
voicelist = ['VO_JA_Kamisato_Ayaka_About_Kujou_Sara','hontonokimochi','gaen_original']
voices = {}
# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
# Cache computed styles on disk so restarts do not recompute them.
# Delete voices.pkl after changing voicelist, or stale styles will be served.
if os.path.exists('voices.pkl'):
    with open('voices.pkl', 'rb') as f:
        voices = pickle.load(f)
else:
    for v in voicelist:
        voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
    with open('voices.pkl', 'wb') as f:
        pickle.dump(voices, f)
# def synthesize(text, voice, multispeakersteps):
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# # if len(global_phonemizer.phonemize([text])) > 300:
# if len(text) > 300:
# raise gr.Error("Text must be under 300 characters")
# v = voice.lower()
# # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
if not torch.cuda.is_available(): INTROTXT += "\n\n### You are on a CPU-only system, inference will be much slower.\n\nYou can use the [online demo](https://huggingface.co/spaces/styletts2/styletts2) for fast inference."
def synthesize(text, voice, lngsteps, progress=gr.Progress()):
if text.strip() == "":
raise gr.Error("You must enter some text")
if len(text) > 50000:
raise gr.Error("Text must be <50k characters")
print("*** saying ***")
print(text)
print("*** end ***")
texts = txtsplit(text)
    # Dropdown values match the keys of `voices` exactly; lower() would break
    # the mixed-case voice names in voicelist, so no case folding here.
    v = voice
audios = []
for t in progress.tqdm(texts):
print(t)
audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
return (24000, np.concatenate(audios))
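# Example usage outside Gradio (hypothetical; scipy is not imported above):
#   sr, wav = synthesize("こんにちは、世界。", voicelist[0], 5)
#   import scipy.io.wavfile as wavfile
#   wavfile.write("out.wav", sr, wav)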
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
# if password == os.environ['ACCESS_CODE']:
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# if lngsteps > 25:
# raise gr.Error("Max 25 steps")
# if lngsteps < 5:
# raise gr.Error("Min 5 steps")
# texts = split_and_recombine_text(text)
# v = voice.lower()
# audios = []
# for t in progress.tqdm(texts):
# audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
# return (24000, np.concatenate(audios))
# else:
# raise gr.Error('Wrong access code')
def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# # if global_phonemizer.phonemize([text]) > 300:
# if len(text) > 400:
# raise gr.Error("Text must be under 400 characters")
# # return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=20, embedding_scale=1))
# return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
if text.strip() == "":
raise gr.Error("You must enter some text")
if len(text) > 50000:
raise gr.Error("Text must be <50k characters")
if embscale > 1.3 and len(text) < 20:
        gr.Warning("WARNING: You entered short text; you may get static!")
print("*** saying ***")
print(text)
print("*** end ***")
texts = txtsplit(text)
audios = []
vs = styletts2importable.compute_style(voice)
# print(vs)
for t in progress.tqdm(texts):
audios.append(styletts2importable.inference(t, vs, alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
# audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
return (24000, np.concatenate(audios))
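# Possible refinement (a sketch, not used above): crossfade adjacent chunks
# instead of hard concatenation to soften seams between sentence-level chunks.
def crossfade_concat(chunks, sr=24000, overlap_s=0.05):
    # Linearly blend the tail of each chunk into the head of the next one.
    out = chunks[0]
    n = int(sr * overlap_s)
    for nxt in chunks[1:]:
        if len(out) < n or len(nxt) < n:
            # Chunk too short to overlap; fall back to plain concatenation.
            out = np.concatenate([out, nxt])
            continue
        fade = np.linspace(0.0, 1.0, n)
        mixed = out[-n:] * (1.0 - fade) + nxt[:n] * fade
        out = np.concatenate([out[:-n], mixed, nxt[n:]])
    return out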
def ljsynthesize(text, steps, progress=gr.Progress()):
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# # if global_phonemizer.phonemize([text]) > 300:
# if len(text) > 400:
# raise gr.Error("Text must be under 400 characters")
noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
    # return (24000, ljspeechimportable.inference(text, noise, diffusion_steps=7, embedding_scale=1))
if text.strip() == "":
raise gr.Error("You must enter some text")
if len(text) > 150000:
raise gr.Error("Text must be <150k characters")
print("*** saying ***")
print(text)
print("*** end ***")
texts = txtsplit(text)
audios = []
for t in progress.tqdm(texts):
audios.append(ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1))
return (24000, np.concatenate(audios))
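# Reproducibility note (an assumption, not part of the original demo): calling
# torch.manual_seed() before the torch.randn() above would fix the diffusion
# noise, making repeated runs with the same text and step count repeatable.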
with gr.Blocks() as vctk:
with gr.Row():
with gr.Column(scale=1):
inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
            voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value=voicelist[0], interactive=True)
            multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Higher gives more diverse, emotional results, but not necessarily higher quality.", interactive=True)
# use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
with gr.Column(scale=1):
btn = gr.Button("Synthesize", variant="primary")
audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
with gr.Blocks() as clone:
with gr.Row():
with gr.Column(scale=1):
clinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
            vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should give better quality at the cost of speed, but in practice the difference is hard to notice. Try lower steps first - it is faster.", interactive=True)
            embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: setting this too high on short text will produce static!", interactive=True)
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
with gr.Column(scale=1):
clbtn = gr.Button("Synthesize", variant="primary")
claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
# with gr.Blocks() as longText:
# with gr.Row():
# with gr.Column(scale=1):
# lnginp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
# lngvoice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-1', interactive=True)
# lngsteps = gr.Slider(minimum=5, maximum=25, value=10, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
# lngpwd = gr.Textbox(label="Access code", info="This feature is in beta. You need an access code to use it as it uses more resources and we would like to prevent abuse")
# with gr.Column(scale=1):
# lngbtn = gr.Button("Synthesize", variant="primary")
# lngaudio = gr.Audio(interactive=False, label="Synthesized Audio")
# lngbtn.click(longsynthesize, inputs=[lnginp, lngvoice, lngsteps, lngpwd], outputs=[lngaudio], concurrency_limit=4)
with gr.Blocks() as lj:
with gr.Row():
with gr.Column(scale=1):
ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
            ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should give better quality at the cost of speed, but in practice the difference is hard to notice. Try lower steps first - it is faster.", interactive=True)
with gr.Column(scale=1):
ljbtn = gr.Button("Synthesize", variant="primary")
ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps], outputs=[ljaudio], concurrency_limit=4)
with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
gr.Markdown(INTROTXT)
gr.DuplicateButton("Duplicate Space")
# gr.TabbedInterface([vctk, clone, lj, longText], ['Multi-Voice', 'Voice Cloning', 'Text-guided Inference', 'Long Text [Beta]'])
    gr.TabbedInterface([vctk, clone, lj], ['Multi-Voice', 'Voice Cloning (not recommended)', 'Text-guided Inference'])
gr.Markdown("""
Demo by [mrfakename](https://twitter.com/realmrfakename). I am not affiliated with the StyleTTS 2 authors.
Run this demo locally using Docker:
```bash
docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all registry.hf.space/styletts2-styletts2:latest python app.py
```
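If you do not have an NVIDIA GPU, omitting `--gpus all` should still run the image on CPU, only much more slowly.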
""") # Please do not remove this line.
if __name__ == "__main__":
# demo.queue(api_open=False, max_size=15).launch(show_api=False)
demo.queue(api_open=False, max_size=15).launch(show_api=False)
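    # Assumption: when serving from inside Docker, launch() typically also needs
    # server_name="0.0.0.0" so the app is reachable from outside the container.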