File size: 4,256 Bytes
468fe55
 
e7ab3a1
 
468fe55
 
 
 
 
 
 
 
 
 
 
 
78b1e75
 
 
468fe55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b1e75
468fe55
 
78b1e75
 
 
 
468fe55
78b1e75
468fe55
78b1e75
 
 
 
 
468fe55
78b1e75
468fe55
78b1e75
468fe55
78b1e75
 
 
 
 
468fe55
78b1e75
468fe55
78b1e75
468fe55
 
 
 
 
78b1e75
468fe55
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import time

import streamlit as st

from zerovox.tts.synthesize import ZeroVoxTTS
from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN

# Sample rate handed to the audio widgets of this page.
# FIXME (kept from the original author): hard-coded value.
SAMPLE_RATE = 24000
TTS_MODEL_NAME = 'tts_en_de_zerovox_alpha1'

# Seed the per-session values once; entries that already exist in the
# session state are left untouched.
for _state_key, _initial_value in (
    ("text", "Welcome to the world of speech synthesis!"),
    ("message", "READY."),
    ("autoplay", False),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

def update_text_input():
    """Reset the text to synthesize to the sample sentence of the chosen language.

    Registered as the ``on_change`` callback of the language selectbox
    (session key ``'lang'``).  Only ``st.session_state.text`` is written: the
    script body passes that value to the text input when it builds the page,
    so there is no module-level variable to keep in sync here.

    Languages without a sample sentence leave the current text unchanged.
    """
    sample_sentences = {
        "en": "Welcome to the world of speech synthesis!",
        "de": "Willkommen in der Welt der Sprachsynthese!",
    }

    sentence = sample_sentences.get(st.session_state['lang'])
    if sentence is not None:
        st.session_state.text = sentence

def do_synth():
    """Synthesize the current text and store the result in the session state.

    Registered as the ``on_click`` callback of the "Synthesize!" button.

    Reads:
        * ``st.session_state['lang']`` and ``st.session_state['meldec']`` --
          the selected language and MEL decoder.
        * the module-level names ``text``, ``speakerref`` and ``status`` that
          the script body binds (they are only read, so no ``global``
          declaration is required).

    Writes to ``st.session_state``:
        * ``synth`` / ``modelcfg`` -- the loaded model, reused across calls
          until the language or the MEL decoder changes.
        * ``wav`` -- the synthesized waveform.
        * ``message`` -- timing summary shown in the status widget.
        * ``autoplay`` -- set to ``True`` so the new audio starts playing.
    """
    lang = st.session_state['lang']
    meldec_model = st.session_state['meldec']

    synth = st.session_state.synth if 'synth' in st.session_state else None

    # A cached model is only valid for the settings it was loaded with.
    if synth is not None and (synth.language != lang
                              or synth.meldec_model != meldec_model):
        synth = None  # trigger reload

    if synth is None:

        status.update(label="loading the model...", state="running")

        g2p_model = DEFAULT_G2P_MODEL_NAME_DE if lang == 'de' else DEFAULT_G2P_MODEL_NAME_EN

        modelcfg, synth = ZeroVoxTTS.load_model(TTS_MODEL_NAME,
                                                g2p=g2p_model,
                                                lang=lang,
                                                meldec_model=meldec_model,
                                                infer_device='cpu',
                                                num_threads=-1,
                                                do_compile=False,
                                                verbose=False)
        st.session_state.modelcfg = modelcfg
        st.session_state.synth = synth

    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")

    spkemb = synth.speaker_embed(ZeroVoxTTS.get_speakerref(speakerref, sampling_rate))

    status.update(label="synthesizing...", state="running")

    # perf_counter() is the clock intended for measuring short intervals.
    start_time = time.perf_counter()

    # tts() returns three values; only the waveform is needed here.
    wav, _phoneme, _length = synth.tts(text, spkemb)

    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate
    # NOTE: computed as audio length / synthesis time, so values above 1 mean
    # faster than real time.  Guard against a zero-length interval.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float('inf')

    st.session_state.message = (f"synth time: {elapsed_time:.2f} sec"
                                f", voice length: {wav_len:.2f} sec"
                                f", rtf: {real_time_factor:.2f}")

    st.session_state.wav = wav
    st.session_state.autoplay = True

# ---------------------------------------------------------------------------
# Page layout.  The names bound here (lang, text, meldec, speakerref, status,
# playback) live at module level; do_synth() reads some of them.
# ---------------------------------------------------------------------------

st.markdown(
    "# ZeroVOX TTS Demo\n\n"
    "ZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\n"
    "For more information, check out\n"
    "[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n"
)

col1, col2 = st.columns(2)

with col1:
    # Changing the language swaps in the matching sample sentence.
    lang = st.selectbox("Language",
                        ["en", "de"],
                        on_change=update_text_input,
                        key='lang')

    text = st.text_input("Text to synthesize", value=st.session_state.text)

    st.button("Synthesize!", type="primary", on_click=do_synth)

with col2:
    meldec = st.selectbox("MEL decoder",
                          ["meldec-libritts-multi-band-melgan-v2", "meldec-libritts-hifigan-v1"],
                          key='meldec')

    speakerref = st.selectbox("Voice sample", ZeroVoxTTS.available_speakerrefs())

    # Preview of the selected reference voice.
    st.audio(ZeroVoxTTS.get_speakerref(speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)

status = st.status(st.session_state.message, state="complete")

if 'wav' in st.session_state:

    # Play the synthesized audio at the sampling rate of the model that
    # produced it (do_synth() stores the model config in the session state);
    # fall back to the hard-coded rate if no config is available.
    playback_rate = SAMPLE_RATE
    if 'modelcfg' in st.session_state:
        playback_rate = st.session_state.modelcfg['audio']['sampling_rate']

    playback = st.audio(st.session_state.wav, sample_rate=playback_rate, autoplay=st.session_state.autoplay)

else:

    playback = st.empty()