"""Gradio demo app for StyleTTS 2.

Builds three tabs and serves them from a single ``gr.Blocks`` page:

* preset voices whose style vectors are computed once at import time,
* voice cloning from an uploaded reference clip,
* text-guided inference through ``ljspeechimportable``.

Every handler returns ``(SAMPLE_RATE, samples)``, the tuple form accepted by
``gr.Audio`` outputs.
"""

import os
import pickle

import gradio as gr
import numpy as np
import torch
from txtsplit import txtsplit

import styletts2importable
import ljspeechimportable

INTROTXT = """# StyleTTS 2

kudos to mrfakename for the base gradio code I'm borrowing here.

日本語用

You will probably experience slight artifacts at the beginning or at the end of the output, which is not there on my server.

Unfortunately, due to the variation in how floating-point operations are performed across different devices, and given the intrinsic characteristics of models that incorporate diffusion components, it is unlikely that you will achieve identical results to those obtained on my server, where the model was originally trained.

So, the output you're about to hear may not accurately reflect the true performance of the model.

=========

残念ながら、異なるデバイスで浮動小数点演算が異なる方法で行われるため、およびDiffusionコンポーネントを取り入れたモデルの固有の特性を考慮すると、
モデルが元々トレーニングされたデバイスで得られた結果と同じ結果を得ることは難しいでしょう。
その結果、以下で体験するパフォーマンスはモデルの真の性能を正確に反映していません。
そのため、音声の開始時または終了時にアーティファクトが発生する可能性があります。

**
"""

if not torch.cuda.is_available():
    INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."

# First element of every handler's return tuple.
SAMPLE_RATE = 24000

# NOTE: defined but not passed to the page below, which uses the
# "NoCrypt/miku" theme instead.
theme = gr.themes.Base(
    font=[
        gr.themes.GoogleFont('Libre Franklin'),
        gr.themes.GoogleFont('Public Sans'),
        'system-ui',
        'sans-serif',
    ],
)

voicelist = [
    'VO_JA_Kamisato_Ayaka_About_Kujou_Sara',
    'hontonokimochi',
    'gaen_original',
]

# Style vector per preset voice, keyed by the exact names in ``voicelist``.
# TODO: cache the computed styles on disk instead of recomputing them on
# every start.
voices = {}
for v in voicelist:
    voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')


def _require_text(text, max_chars):
    """Raise ``gr.Error`` when *text* is blank or longer than *max_chars*."""
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > max_chars:
        raise gr.Error(f"Text must be <{max_chars // 1000}k characters")


def _log_request(text):
    """Echo the requested text to stdout."""
    print("*** saying ***")
    print(text)
    print("*** end ***")


def _resolve_voice(voice):
    """Return the precomputed style for *voice*.

    Tries an exact match first and falls back to a case-insensitive match,
    so mixed-case preset names resolve as well as lower-cased ones.

    Raises:
        gr.Error: if no preset matches.
    """
    if voice in voices:
        return voices[voice]
    wanted = str(voice).lower()
    for name, style in voices.items():
        if name.lower() == wanted:
            return style
    raise gr.Error(f"Unknown voice: {voice}")


def _render_chunks(text, infer_chunk, progress, log_chunks=False):
    """Split *text*, synthesize each chunk and join the audio.

    Args:
        text: full input text; split with ``txtsplit``.
        infer_chunk: callable mapping one text chunk to an audio array.
        progress: Gradio progress tracker used to wrap the chunk loop.
        log_chunks: when true, print each chunk before synthesizing it.

    Returns:
        ``(SAMPLE_RATE, samples)`` with all chunk outputs concatenated.

    Raises:
        gr.Error: if splitting produced no chunks.
    """
    audios = []
    for chunk in progress.tqdm(txtsplit(text)):
        if log_chunks:
            print(chunk)
        audios.append(infer_chunk(chunk))
    if not audios:
        # np.concatenate rejects an empty list; report it as a UI error.
        raise gr.Error("You must enter some text")
    return (SAMPLE_RATE, np.concatenate(audios))


def synthesize(text, voice, lngsteps, password=None, progress=gr.Progress(),
               alpha=0.3, beta=0.7):
    """Synthesize *text* with one of the preset voices.

    Args:
        text: text to read; must be non-blank and at most 50000 characters.
        voice: preset voice name (see ``voicelist``).
        lngsteps: forwarded as ``diffusion_steps``.
        password: unused; kept so existing positional callers keep working.
        progress: Gradio progress tracker.
        alpha: forwarded to ``styletts2importable.inference``.
        beta: forwarded to ``styletts2importable.inference``.

    Returns:
        ``(SAMPLE_RATE, samples)``.

    Raises:
        gr.Error: on blank or over-long text, or an unknown voice.
    """
    _require_text(text, 50000)
    style = _resolve_voice(voice)
    _log_request(text)
    return _render_chunks(
        text,
        lambda chunk: styletts2importable.inference(
            chunk, style, alpha=alpha, beta=beta,
            diffusion_steps=lngsteps, embedding_scale=1),
        progress,
        log_chunks=True,
    )


def _synthesize_preset(text, voice, steps, alpha, beta, progress=gr.Progress()):
    """UI adapter for the preset-voice tab.

    Its positional parameters line up one-to-one with the five components
    wired as inputs, which ``synthesize``'s own signature does not.
    """
    return synthesize(text, voice, steps, progress=progress,
                      alpha=alpha, beta=beta)


def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
    """Synthesize *text* in the style of an uploaded reference clip.

    Args:
        text: text to read; must be non-blank and at most 50000 characters.
        voice: file path of the reference audio.
        vcsteps: forwarded as ``diffusion_steps``.
        embscale: forwarded as ``embedding_scale``.
        alpha: forwarded to ``styletts2importable.inference``.
        beta: forwarded to ``styletts2importable.inference``.
        progress: Gradio progress tracker.

    Returns:
        ``(SAMPLE_RATE, samples)``.

    Raises:
        gr.Error: on blank or over-long text, or a missing reference clip.
    """
    _require_text(text, 50000)
    if not voice:
        raise gr.Error("You must provide a reference audio clip")
    if embscale > 1.3 and len(text) < 20:
        gr.Warning("WARNING: You entered short text, you may get static!")
    _log_request(text)
    style = styletts2importable.compute_style(voice)
    return _render_chunks(
        text,
        lambda chunk: styletts2importable.inference(
            chunk, style, alpha=alpha, beta=beta,
            diffusion_steps=vcsteps, embedding_scale=embscale),
        progress,
    )


def ljsynthesize(text, steps, embscale, progress=gr.Progress()):
    """Synthesize *text* with the text-guided model.

    Args:
        text: text to read; must be non-blank and at most 150000 characters.
        steps: forwarded as ``diffusion_steps``.
        embscale: forwarded as ``embedding_scale``.
        progress: Gradio progress tracker.

    Returns:
        ``(SAMPLE_RATE, samples)``.

    Raises:
        gr.Error: on blank or over-long text.
    """
    _require_text(text, 150000)
    _log_request(text)
    # One noise tensor is drawn per request and shared by all chunks.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    noise = torch.randn(1, 1, 256).to(device)
    return _render_chunks(
        text,
        lambda chunk: ljspeechimportable.inference(
            chunk, noise, diffusion_steps=steps, embedding_scale=embscale),
        progress,
    )


with gr.Blocks() as vctk:
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
            # The default must be one of the listed choices.
            voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value=voicelist[0], interactive=True)
            multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="これを増えたらもっとエモーショナルな結果になりますが、クオリティーのいい結果になるとは限らない。", interactive=True)
            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
            beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
        with gr.Column(scale=1):
            btn = gr.Button("Synthesize", variant="primary")
            audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
            btn.click(_synthesize_preset, inputs=[inp, voice, multispeakersteps, alpha, beta], outputs=[audio], concurrency_limit=4)

with gr.Blocks() as clone:
    with gr.Row():
        with gr.Column(scale=1):
            clinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
            clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
            vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
            clembscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
            clalpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
            clbeta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
        with gr.Column(scale=1):
            clbtn = gr.Button("Synthesize", variant="primary")
            claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
            clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps, clembscale, clalpha, clbeta], outputs=[claudio], concurrency_limit=4)

# TODO: an access-code-gated "Long Text" tab used to live here and is
# currently disabled.

with gr.Blocks() as lj:
    with gr.Row():
        with gr.Column(scale=1):
            ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True, value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしています。あなたがいない日は、まるで冬のように寒く、暗いです.")
            ljembscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. これを上げたらパフォーマンスがもっとエモーショナルになる、増やしすぎるとだめになるので、ご注意ください", interactive=True)
            ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
        with gr.Column(scale=1):
            ljbtn = gr.Button("Synthesize", variant="primary")
            ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
            ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps, ljembscale], outputs=[ljaudio], concurrency_limit=4)

with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme="NoCrypt/miku") as demo:
    gr.Markdown(INTROTXT)
    gr.DuplicateButton("Duplicate Space")
    # One tab name per interface, in the same order.
    gr.TabbedInterface(
        [vctk, clone, lj],
        ['With Reference Audio', '|do not use this option|', 'Text-guided Inference'],
    )
    gr.Markdown("""
the base code was borrowed from -> [mrfakename](https://twitter.com/realmrfakename).
Neither of use are affiliated with the StyleTTS 2 authors.
""")  # Please do not remove this line.

if __name__ == "__main__":
    demo.queue(api_open=False, max_size=15).launch(show_api=False)