File size: 4,641 Bytes
468fe55
 
e7ab3a1
 
468fe55
 
 
 
 
41889ec
 
468fe55
 
 
 
 
 
78b1e75
 
 
468fe55
 
 
 
 
 
b8c03ff
468fe55
 
 
b8c03ff
468fe55
 
 
 
 
 
f099617
468fe55
f099617
 
 
 
468fe55
 
 
 
 
 
 
f099617
468fe55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8c03ff
468fe55
 
 
 
 
 
 
 
 
 
 
78b1e75
468fe55
 
78b1e75
 
 
 
468fe55
b8c03ff
468fe55
b8c03ff
78b1e75
 
 
 
468fe55
41889ec
468fe55
41889ec
468fe55
41889ec
 
b8c03ff
78b1e75
 
 
 
468fe55
b8c03ff
468fe55
b8c03ff
 
468fe55
b8c03ff
468fe55
 
 
78b1e75
468fe55
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import time

import streamlit as st

from zerovox.tts.synthesize import ZeroVoxTTS
from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN

# Sample rate passed to st.audio and used when loading the voice sample preview.
SAMPLE_RATE = 24000  # FIXME

# Voice sample that is pre-selected in the "Voice sample" select box.
DEFAULT_SPEAKER = 'en_speaker_00061.wav'

# Seed the session state on the first run only; values that already exist
# (e.g. text typed by the user) are left untouched on reruns.
for _state_key, _initial_value in (
    ("text", "Welcome to the world of speech synthesis!"),
    ("message", "READY."),
    ("autoplay", False),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

def update_text_input():
    """Reset the text field to the greeting of the selected language.

    Used as the ``on_change`` callback of the "Language" select box. A
    language without a known greeting leaves the current text unchanged.
    """
    greetings = {
        "en": "Welcome to the world of speech synthesis!",
        "de": "Willkommen in der Welt der Sprachsynthese!",
    }
    greeting = greetings.get(st.session_state['lang'])
    if greeting is not None:
        st.session_state.text = greeting

def do_synth():
    """Synthesize the text held in the session state and store the result.

    Used as a Streamlit callback (text input ``on_change`` and the
    "Synthesize!" button ``on_click``). The loaded model is cached in
    ``st.session_state.synth`` / ``st.session_state.modelcfg`` and is only
    reloaded when the selected MEL decoder differs from the cached one; a
    language change on a cached model just updates ``synth.language``.

    Reads the module-level names ``status`` (status container) and
    ``speakerref`` (selected voice sample) created by the page script.

    Writes to the session state:
        message:  timing summary (synth time, voice length, rtf)
        wav:      the synthesized waveform
        autoplay: set to True
    """
    # Widget values are read from the session state: callbacks run before the
    # script body is re-executed, so the session state holds the current
    # selection (the MEL decoder below is read the same way).
    language = st.session_state['lang']
    meldec_model = st.session_state['meldec']

    synth = st.session_state.synth if 'synth' in st.session_state else None

    if synth is not None:
        if synth.meldec_model != meldec_model:
            synth = None  # trigger reload
        elif synth.language != language:
            status.update(label=f"loading the lexicon for {language} ...", state="running")
            synth.language = language

    if synth is None:

        status.update(label="loading the model...", state="running")

        g2p_model = DEFAULT_G2P_MODEL_NAME_DE if language == 'de' else DEFAULT_G2P_MODEL_NAME_EN

        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            ZeroVoxTTS.get_default_model(),
            g2p=g2p_model,
            lang=language,
            meldec_model=meldec_model,
            infer_device='cpu',
            num_threads=-1,
            do_compile=False,
            verbose=False,
        )

    synth = st.session_state.synth
    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")

    spkemb = synth.speaker_embed(ZeroVoxTTS.get_speakerref(speakerref, sampling_rate))

    status.update(label="synthesizing...", state="running")

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments.
    start_time = time.perf_counter()

    wav, _phoneme, _length = synth.tts(st.session_state.text, spkemb)

    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate
    # Guard against a zero interval so the ratio never raises ZeroDivisionError.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float("inf")

    st.session_state.message = (
        f"synth time: {elapsed_time:.2f} sec"
        f", voice length: {wav_len:.2f} sec"
        f", rtf: {real_time_factor:.2f}"
    )

    st.session_state.wav = wav
    st.session_state.autoplay = True

# Page header: short project description plus a link to the repository.
st.markdown("# ZeroVOX TTS Demo\n\nZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\nFor more information, check out\n[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n")

tab1, tab2 = st.tabs(["Settings", "MEL Decoder"])

with tab1:
    # Changing the language resets the text field via update_text_input.
    lang = st.selectbox("Language",
                        ["en", "de"],
                        on_change=update_text_input,
                        key='lang')

    speakers = list(ZeroVoxTTS.available_speakerrefs())

    # Pre-select the default voice when it is available; fall back to the
    # first entry instead of raising ValueError when it is missing.
    default_speaker_index = speakers.index(DEFAULT_SPEAKER) if DEFAULT_SPEAKER in speakers else 0

    speakerref = st.selectbox("Voice sample", speakers, index=default_speaker_index)

    # Preview of the selected voice sample.
    st.audio(ZeroVoxTTS.get_speakerref(speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)

with tab2:
    meldec = st.selectbox("MEL decoder",
                          ["meldec-libritts-multi-band-melgan-v2", "meldec-libritts-hifigan-v1"],
                          key='meldec')

# Shows the message stored in the session state ("READY." or the last timing summary).
status = st.status(st.session_state.message, state="complete")

text = st.text_input("Text to synthesize", key='text', on_change=do_synth)

st.button("Synthesize!", type="primary", on_click=do_synth)

if 'wav' in st.session_state:

    # do_synth measures the waveform against the model's own sampling rate, so
    # play it back at that rate when the model config is available; otherwise
    # keep the hard-coded default.
    if 'modelcfg' in st.session_state:
        playback_rate = st.session_state.modelcfg['audio']['sampling_rate']
    else:
        playback_rate = SAMPLE_RATE

    playback = st.audio(st.session_state.wav, sample_rate=playback_rate, autoplay=st.session_state.autoplay)

else:

    # Placeholder until the first synthesis result exists.
    playback = st.empty()