File size: 6,197 Bytes
a2d19e9
468fe55
a2d19e9
468fe55
e7ab3a1
 
468fe55
 
 
 
 
41889ec
 
3b61dcb
 
 
 
a2d19e9
 
 
468fe55
3b61dcb
468fe55
 
 
 
78b1e75
 
 
a2d19e9
 
 
 
 
 
 
 
 
468fe55
 
 
3b61dcb
468fe55
3b61dcb
468fe55
 
 
a2d19e9
468fe55
 
 
 
 
f099617
468fe55
f099617
a2d19e9
 
 
468fe55
 
 
 
 
 
 
f099617
468fe55
 
 
 
 
 
1a38443
468fe55
 
 
 
 
 
a2d19e9
 
 
 
 
 
468fe55
 
 
 
 
b8c03ff
468fe55
 
 
 
 
 
 
 
 
 
 
 
78b1e75
 
a2d19e9
 
78b1e75
468fe55
a2d19e9
468fe55
b8c03ff
468fe55
a2d19e9
 
 
468fe55
a2d19e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468fe55
41889ec
b8c03ff
78b1e75
f6feaaf
78b1e75
 
468fe55
b8c03ff
468fe55
a2d19e9
 
 
 
 
 
 
 
 
468fe55
b8c03ff
468fe55
 
 
78b1e75
468fe55
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import tempfile
import time
import librosa

import streamlit as st

from zerovox.tts.synthesize import ZeroVoxTTS
from zerovox.g2p.g2p import DEFAULT_G2P_MODEL_NAME_DE, DEFAULT_G2P_MODEL_NAME_EN

# Sample rate (Hz) the UI uses when loading voice samples and when creating
# audio player widgets.
# FIXME: hard-coded. do_synth() takes the rate from the loaded model config
# (modelcfg['audio']['sampling_rate']) -- presumably both should agree.
SAMPLE_RATE = 24000

# Speaker reference that is pre-selected in the "Voice" selectbox.
DEFAULT_SPEAKER = 'en_speaker_00061.wav'

# Sample sentences that pre-fill the text input, one per supported language.
# Alternative (shorter) English sample: "Welcome to the world of speech synthesis!"
SAMPLE_SENTENCE_EN = "A rainbow is an optical phenomenon caused by refraction, internal reflection and dispersion of light in water droplets resulting in a continuous spectrum of light appearing in the sky."
SAMPLE_SENTENCE_DE = "Der Regenbogen ist ein atmosphärisch-optisches Phänomen, das als kreisbogenförmiges farbiges Lichtband in einem von der Sonne beschienenen Regenschauer erscheint."

# Initial values for every session-state entry the app relies on. An entry is
# only written when it is missing, so values from earlier reruns survive.
_SESSION_DEFAULTS = {
    "lang": "en",
    "text": SAMPLE_SENTENCE_EN,
    "message": "READY.",
    "autoplay": False,
    "speakerref": DEFAULT_SPEAKER,
    "custom_voice": False,
    "voice_wav": None,
}

for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

def update_text_input():
    """Put the sample sentence for the selected language into the text field.

    Used as the ``on_change`` callback of the language selectbox. Reads
    ``st.session_state['lang']`` and overwrites ``st.session_state.text``
    for "en" and "de"; any other language value leaves the text untouched.
    """
    samples = {"en": SAMPLE_SENTENCE_EN, "de": SAMPLE_SENTENCE_DE}
    sample = samples.get(st.session_state['lang'])
    if sample is not None:
        st.session_state.text = sample

def do_synth():
    """Synthesize ``st.session_state.text`` and store the result in session state.

    Used as a Streamlit callback (``on_change`` of the text input and
    ``on_click`` of the "Synthesize!" button). The function

    1. reuses the synthesizer cached in ``st.session_state.synth``; it is
       reloaded when the selected MEL decoder differs from the cached one,
       and its language is switched when the selected language differs,
    2. obtains the speaker reference (stock speaker or uploaded custom voice)
       and computes the speaker embedding,
    3. runs the TTS and measures how long it takes.

    Side effects: progress is reported through the module-level ``status``
    widget; ``st.session_state.wav`` receives the waveform,
    ``st.session_state.message`` a timing summary and
    ``st.session_state.autoplay`` is set to ``True``. On a (re)load,
    ``st.session_state.synth`` and ``st.session_state.modelcfg`` are replaced.
    """
    # Always take the widget values from session state: it is already up to
    # date when a callback runs, whereas module-level variables hold whatever
    # the previous script run assigned.
    lang = st.session_state.lang
    meldec_model = st.session_state['meldec']

    synth = None
    if 'synth' in st.session_state:
        synth = st.session_state.synth

        if synth.meldec_model != meldec_model:
            synth = None  # decoder changed -> trigger reload below
        elif synth.language != lang:
            status.update(label=f"loading the lexicon for {lang} ...", state="running")
            synth.language = lang

    if synth is None:

        status.update(label="loading the model...", state="running")

        g2p_model = DEFAULT_G2P_MODEL_NAME_DE if lang == 'de' else DEFAULT_G2P_MODEL_NAME_EN

        st.session_state.modelcfg, st.session_state.synth = ZeroVoxTTS.load_model(
            ZeroVoxTTS.get_default_model(),
            g2p=g2p_model,
            lang=lang,
            meldec_model=meldec_model,
            infer_device='cpu',
            num_threads=-1,
            do_compile=False,
            verbose=True,
        )

    synth = st.session_state.synth
    modelcfg = st.session_state.modelcfg
    sampling_rate = modelcfg['audio']['sampling_rate']

    status.update(label="computing speaker embedding...", state="running")

    # Fall back to the stock speaker when custom voice is enabled but nothing
    # has been uploaded yet.
    if st.session_state.custom_voice and st.session_state.voice_wav is not None:
        speakerref = st.session_state.voice_wav
    else:
        speakerref = ZeroVoxTTS.get_speakerref(st.session_state.speakerref, sampling_rate)

    spkemb = synth.speaker_embed(speakerref)

    status.update(label="synthesizing...", state="running")

    # perf_counter() is the monotonic, high-resolution clock meant for
    # measuring intervals.
    start_time = time.perf_counter()

    wav, _phoneme, _length = synth.tts(st.session_state.text, spkemb)

    elapsed_time = time.perf_counter() - start_time

    wav_len = wav.shape[0] / sampling_rate
    # "rtf" as reported here is seconds of audio per second of compute.
    # Guard against a zero interval so the callback can never divide by zero.
    real_time_factor = wav_len / elapsed_time if elapsed_time > 0 else float('inf')

    st.session_state.message = (
        f"synth time: {elapsed_time:.2f} sec"
        f", voice length: {wav_len:.2f} sec"
        f", rtf: {real_time_factor:.2f}"
    )

    st.session_state.wav = wav
    st.session_state.autoplay = True

# Page-level configuration.
st.set_page_config(
    page_title="ZeroVOX TTS Demo",
    page_icon=':speech_balloon:',
    layout="centered",
    initial_sidebar_state="auto",
    menu_items=None,
)

# Intro text shown at the top of the page.
st.markdown(
    "# ZeroVOX TTS Demo\n\n"
    "ZeroVOX is a zero-shot realtime TTS system, fully offline, free and open source.\n\n"
    "For more information, check out\n"
    "[https://github.com/gooofy/zerovox](https://github.com/gooofy/zerovox)\n"
)

# Two settings tabs: voice selection and MEL decoder selection.
tab1, tab2 = st.tabs(["Voice", "MEL Decoder"])

with tab1:

    st.checkbox("Custom voice", key='custom_voice')

    # Single placeholder that holds either the file uploader (custom voice)
    # or the stock-speaker selectbox.
    speakerref = st.empty()

    if st.session_state.custom_voice:

        # Create a file uploader that accepts only .wav files
        uploaded_file = speakerref.file_uploader("Upload your voice sample", type=["wav"])

        # Decode the upload at the UI sample rate and keep it for do_synth()
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile() as f:
                f.write(uploaded_file.read())
                # librosa reopens the file by name, so the buffered bytes must
                # be on disk first; otherwise it may see a truncated/empty file.
                f.flush()
                # NOTE(review): reopening an open NamedTemporaryFile by name
                # does not work on Windows -- confirm target platforms.
                wav, _ = librosa.load(f.name, sr=SAMPLE_RATE)

            st.session_state.voice_wav = wav

            st.audio(wav, sample_rate=SAMPLE_RATE)

    else:

        speakers = list(ZeroVoxTTS.available_speakerrefs())
        speakerref.selectbox("Voice", speakers, key='speakerref')

        # Preview player for the currently selected stock speaker.
        st.audio(ZeroVoxTTS.get_speakerref(st.session_state.speakerref, SAMPLE_RATE), sample_rate=SAMPLE_RATE)

with tab2:
    # The selection is stored under st.session_state['meldec'], which
    # do_synth() compares against the cached synthesizer's decoder.
    meldec = st.selectbox(
        "MEL decoder",
        ["meldec-libritts-hifigan-v1", "meldec-libritts-multi-band-melgan-v2"],
        key='meldec',
    )

# Status line: shows the last message; do_synth() updates its label while
# it is working.
status = st.status(st.session_state.message, state="complete")

col1, col2 = st.columns([0.8, 0.2])
with col1:
    # Committing a changed text triggers synthesis right away.
    text = st.text_input("Text to synthesize", key='text', on_change=do_synth)

with col2:
    # Switching the language swaps in the matching sample sentence.
    lang = st.selectbox(
        "Language",
        ["en", "de"],
        on_change=update_text_input,
        key='lang',
    )

st.button("Synthesize!", type="primary", on_click=do_synth)

if 'wav' in st.session_state:

    # The waveform was produced at the model's sampling rate, so play it back
    # at that rate; SAMPLE_RATE is only a fallback when no config is cached.
    if 'modelcfg' in st.session_state:
        playback_rate = st.session_state.modelcfg['audio']['sampling_rate']
    else:
        playback_rate = SAMPLE_RATE

    playback = st.audio(st.session_state.wav, sample_rate=playback_rate, autoplay=st.session_state.autoplay)

else:

    # Nothing synthesized yet: keep an empty placeholder in the player's slot.
    playback = st.empty()