File size: 7,974 Bytes
3883c60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import re

import gradio
import tqdm
from bark.api import *
from .bark_generation import generate_text_semantic_new, generate_coarse_new, generate_fine_new, codec_decode_new, SAMPLE_RATE
from typing import Union


def text_to_semantic_new(
    text: str,
    history_prompt: Optional[Union[str, dict]] = None,
    temp: float = 0.7,
    silent: bool = False,
    allow_early_stop: bool = True,
    min_eos_p: float = 0.2,
    progress=gradio.Progress()
):
    """Turn text into a semantic token array.

    Thin wrapper over `generate_text_semantic_new` that always enables
    KV caching.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        allow_early_stop: (Added in new) set to False to generate until the limit
        min_eos_p: (Added in new) Generation stopping likelyness, Lower means more likely to stop.
        progress: (Added in new) Gradio progress bar.

    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    return generate_text_semantic_new(
        text,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
        allow_early_stop=allow_early_stop,
        min_eos_p=min_eos_p,
        progress=progress,
    )


def semantic_to_waveform_new(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[Union[str, dict]] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
    skip_fine: bool = False,
    decode_on_cpu: bool = False,
    progress=gradio.Progress()
):
    """Generate audio array from semantic input.

    Pipeline: semantic -> coarse -> (optionally) fine -> decoded waveform.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt
        skip_fine: (Added in new) Skip converting coarse to fine
        decode_on_cpu: (Added in new) Move everything to cpu when decoding, useful for decoding huge audio files on medium vram
        progress: (Added in new) Gradio progress bar.

    Returns:
        numpy audio array at sample frequency 24khz
    """
    coarse_tokens = generate_coarse_new(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
        progress=progress
    )
    # Fine-tuning pass uses a fixed 0.5 temperature; when skipped, the
    # coarse tokens are decoded directly.
    if skip_fine:
        fine_tokens = coarse_tokens
    else:
        fine_tokens = generate_fine_new(
            coarse_tokens,
            history_prompt=history_prompt,
            temp=0.5,
            progress=progress
        )
    audio_arr = codec_decode_new(fine_tokens, decode_on_cpu)
    if not output_full:
        return audio_arr
    full_generation = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
        "fine_prompt": fine_tokens,
    }
    return full_generation, audio_arr



def strict_split(string: str, regex='([.,:;!?\\n])'):
    """Split *string* into chunks, each ending with its delimiter.

    Args:
        string: text to split.
        regex: pattern with exactly ONE capturing group around the
            delimiter, so `re.split` interleaves text and delimiter
            elements.

    Returns:
        List of chunks; every chunk except possibly the last ends with a
        delimiter. Trailing text with no delimiter is kept as a final
        chunk. A string that ends with a delimiter no longer yields an
        empty trailing chunk (previously `''` was appended, which would
        be fed into audio generation downstream).
    """
    parts = re.split(regex, string)
    # parts = [text, delim, text, delim, ..., text]: even indices are
    # text, odd indices are the captured delimiters. Re-attach each
    # delimiter to the text that precedes it.
    chunks = [text + delim for text, delim in zip(parts[::2], parts[1::2])]
    # re.split with a capturing group always leaves a final text element;
    # keep it only when non-empty (fixes the stray '' tail chunk).
    if parts[-1]:
        chunks.append(parts[-1])
    return chunks


def non_strict_split(string: str):
    """Split only on periods, keeping each '.' attached to its chunk."""
    return strict_split(string, regex='(\\.)')


def long_merge(splits: list[str], limit: int = 220):
    """Greedily merge consecutive chunks without exceeding a length cap.

    Args:
        splits: ordered text chunks (e.g. output of `strict_split`).
        limit: maximum character length of a merged chunk. Default 220,
            estimated for normal speaking speed. (Generalized from a
            hard-coded constant; default preserves prior behavior.)

    Returns:
        List of merged chunks. A single input chunk longer than *limit*
        is kept whole — chunks are never split, only concatenated.
        Empty input yields an empty list.
    """
    merged = []
    current = ''

    for chunk in splits:
        if len(current) + len(chunk) <= limit:
            # Still room: keep accumulating into the current chunk.
            current += chunk
        else:
            # Flush the accumulator (if any) and start a new chunk.
            if current:
                merged.append(current)
            current = chunk

    if current:
        merged.append(current)

    return merged


def strict_short(string):
    """Split on all sentence punctuation; no merging of short chunks."""
    return strict_split(string)


def strict_long(string):
    """Split on all sentence punctuation, then merge up to the length limit."""
    chunks = strict_split(string)
    return long_merge(chunks)


def non_strict_short(string):
    """Split on periods only; no merging of short chunks."""
    return non_strict_split(string)


def non_strict_long(string):
    """Split on periods only, then merge up to the length limit."""
    pieces = non_strict_split(string)
    return long_merge(pieces)


def generate_audio_new(
    text: str,
    history_prompt: Optional[Union[str, dict]] = None,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
    skip_fine: bool = False,
    decode_on_cpu: bool = False,
    allow_early_stop: bool = True,
    min_eos_p: float = 0.2,
    long_gen_silence_secs: float = 0,
    long_gen_re_feed: bool = True,
    gen_prefix: str = '',
    split_type: str = 'Manual',
    progress=gradio.Progress()
):
    """Generate audio array from input text.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt
        skip_fine: (Added in new) Skip converting from coarse to fine
        decode_on_cpu: (Added in new) Decode on cpu
        allow_early_stop: (Added in new) Set to false to continue until the limit is reached
        min_eos_p: (Added in new) Lower values stop the generation earlier.
        long_gen_silence_secs: (Added in new) The amount of silence between clips for long form generations.
        long_gen_re_feed: (Added in new) For longer generations (\n) use the last generated chunk as the prompt for the next. Better continuation at risk of changing voice.
        gen_prefix: (Added in new) A prefix to add to every single generated chunk.
        split_type: (Added in new) The way to split the clips.
        progress: (Added in new) Gradio progress bar.

    Returns:
        numpy audio array at sample frequency 24khz
    """
    if gen_prefix:
        gen_prefix = gen_prefix + ' '

    silence = np.zeros(int(long_gen_silence_secs * SAMPLE_RATE))
    gen_audio = []
    if text:
        match split_type.casefold():
            case 'manual':
                gen_sections = text.strip().split('\n')
            case 'strict short':
                gen_sections = strict_short(text)
            case 'strict long':
                gen_sections = strict_long(text)
            case 'non-strict short':
                gen_sections = non_strict_short(text)
            case 'non-strict long':
                gen_sections = non_strict_long(text)
            case _:
                print('??? Unknown split method selected. Not splitting.')
                gen_sections = [text]
    else:
        gen_sections = [text]
    print('Generation split into sections:', gen_sections)
    for input_text in tqdm.tqdm(gen_sections, desc='Generation section'):
        input_text = gen_prefix + input_text
        semantic_tokens = text_to_semantic_new(
            input_text,
            history_prompt=history_prompt,
            temp=text_temp,
            silent=silent,
            allow_early_stop=allow_early_stop,
            min_eos_p=min_eos_p,
            progress=progress
        )
        out = semantic_to_waveform_new(
            semantic_tokens,
            history_prompt=history_prompt,
            temp=waveform_temp,
            silent=silent,
            output_full=True,
            skip_fine=skip_fine,
            decode_on_cpu=decode_on_cpu,
            progress=progress
        )
        full_generation, gen_audio_new = out
        if long_gen_re_feed:
            history_prompt = full_generation
        gen_audio += [gen_audio_new, silence.copy()]

    gen_audio = np.concatenate(gen_audio)

    if output_full:
        return full_generation, gen_audio
    return gen_audio