Spaces:
No application file
No application file
File size: 7,974 Bytes
3883c60 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 |
import re
import gradio
import tqdm
from bark.api import *
from .bark_generation import generate_text_semantic_new, generate_coarse_new, generate_fine_new, codec_decode_new, SAMPLE_RATE
from typing import Union
def text_to_semantic_new(
    text: str,
    history_prompt: Optional[Union[str, dict]] = None,
    temp: float = 0.7,
    silent: bool = False,
    allow_early_stop: bool = True,
    min_eos_p: float = 0.2,
    progress=gradio.Progress()
):
    """Turn input text into a semantic token array.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        allow_early_stop: (Added in new) set to False to generate until the limit
        min_eos_p: (Added in new) stop-likelihood threshold; lower stops more readily
        progress: (Added in new) Gradio progress bar.
    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    # Pure delegation to the patched generator; KV caching is always on here.
    return generate_text_semantic_new(
        text,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
        allow_early_stop=allow_early_stop,
        min_eos_p=min_eos_p,
        progress=progress,
    )
def semantic_to_waveform_new(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[Union[str, dict]] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
    skip_fine: bool = False,
    decode_on_cpu: bool = False,
    progress=gradio.Progress()
):
    """Generate an audio array from semantic tokens.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt
        skip_fine: (Added in new) Skip converting coarse to fine
        decode_on_cpu: (Added in new) Move everything to cpu when decoding, useful for decoding huge audio files on medium vram
        progress: (Added in new) Gradio progress bar.
    Returns:
        numpy audio array at sample frequency 24khz
    """
    coarse_tokens = generate_coarse_new(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
        progress=progress,
    )
    # When the fine pass is skipped, the coarse tokens are decoded directly.
    # The fine pass always runs at a fixed temperature of 0.5.
    fine_tokens = (
        coarse_tokens
        if skip_fine
        else generate_fine_new(
            coarse_tokens,
            history_prompt=history_prompt,
            temp=0.5,
            progress=progress,
        )
    )
    audio_arr = codec_decode_new(fine_tokens, decode_on_cpu)
    if not output_full:
        return audio_arr
    # Bundle every intermediate stage so the result can seed a later generation.
    full_generation = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse_tokens,
        "fine_prompt": fine_tokens,
    }
    return full_generation, audio_arr
def strict_split(string: str, regex=r'([.,:;!?\n])'):
    """Split *string* after every separator matched by *regex*.

    Each separator stays attached to the end of its preceding chunk,
    e.g. ``"a.b"`` -> ``['a.', 'b']``.

    Args:
        string: text to split into sections.
        regex: pattern with exactly ONE capturing group around the separator
            set (the group is what makes `re.split` return the separators).

    Returns:
        List of chunks covering all of *string*; ``['']`` for empty input.
    """
    # With a capturing group, re.split alternates text and separator:
    # [text, sep, text, sep, ..., text] — always an odd number of elements.
    parts = re.split(regex, string)
    out = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]
    # Append the trailing separator-less text. Bug fix: the old code also
    # appended it when it was empty (input ending in a separator, e.g. "a."),
    # yielding a spurious '' section that later generated a prefix-only audio
    # chunk. Keep it only if non-empty, or if there is nothing else at all
    # (preserves the original [''] result for empty input).
    if parts[-1] or not out:
        out.append(parts[-1])
    return out
def non_strict_split(string: str):
    """Split only on periods ('.'), keeping each period with its chunk."""
    period_only = r'(\.)'
    return strict_split(string, period_only)
def long_merge(splits: list[str]):
    """Greedily merge consecutive chunks while staying under a length cap.

    Consecutive entries of *splits* are concatenated as long as the running
    total stays within the cap; a chunk that would overflow starts a new
    group. A single chunk longer than the cap is kept whole.

    Args:
        splits: ordered text chunks (e.g. output of `strict_split`).

    Returns:
        List of merged chunks; empty chunks never produce empty groups.
    """
    limit = 220  # Estimated for normal speaking speed
    merged = []
    pending = ''
    for chunk in splits:
        if len(pending) + len(chunk) <= limit:
            pending += chunk
            continue
        if pending:
            merged.append(pending)
        pending = chunk
    # Flush whatever is left over after the final chunk.
    if pending:
        merged.append(pending)
    return merged
def strict_short(string):
    """Split on any of . , : ; ! ? or newline; keep every chunk separate."""
    return strict_split(string)
def strict_long(string):
    """Split on any of . , : ; ! ? or newline, then merge chunks up to ~220 chars."""
    return long_merge(strict_split(string))
def non_strict_short(string):
    """Split on periods only; keep every chunk separate."""
    return non_strict_split(string)
def non_strict_long(string):
    """Split on periods only, then merge chunks up to ~220 chars."""
    return long_merge(non_strict_split(string))
def generate_audio_new(
    text: str,
    history_prompt: Optional[Union[str, dict]] = None,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
    skip_fine: bool = False,
    decode_on_cpu: bool = False,
    allow_early_stop: bool = True,
    min_eos_p: float = 0.2,
    long_gen_silence_secs: float = 0,
    long_gen_re_feed: bool = True,
    gen_prefix: str = '',
    split_type: str = 'Manual',
    progress=gradio.Progress()
):
    """Generate audio array from input text.
    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt
        skip_fine: (Added in new) Skip converting from coarse to fine
        decode_on_cpu: (Added in new) Decode on cpu
        allow_early_stop: (Added in new) Set to false to continue until the limit is reached
        min_eos_p: (Added in new) Lower values stop the generation earlier.
        long_gen_silence_secs: (Added in new) The amount of silence between clips for long form generations.
        long_gen_re_feed: (Added in new) For longer generations (\n) use the last generated chunk as the prompt for the next. Better continuation at risk of changing voice.
        gen_prefix: (Added in new) A prefix to add to every single generated chunk.
        split_type: (Added in new) The way to split the clips.
        progress: (Added in new) Gradio progress bar.
    Returns:
        numpy audio array at sample frequency 24khz
    """
    if gen_prefix:
        # Keep the prefix separated from each section's text by a space.
        gen_prefix = gen_prefix + ' '
    # Silence inserted after every section; an empty array when
    # long_gen_silence_secs is 0 (the default), making concatenation a no-op.
    silence = np.zeros(int(long_gen_silence_secs * SAMPLE_RATE))
    gen_audio = []
    if text:
        # Dispatch on the (case-insensitive) split mode to break the text
        # into per-generation sections.
        match split_type.casefold():
            case 'manual':
                # Manual mode: the user marks section boundaries with newlines.
                gen_sections = text.strip().split('\n')
            case 'strict short':
                gen_sections = strict_short(text)
            case 'strict long':
                gen_sections = strict_long(text)
            case 'non-strict short':
                gen_sections = non_strict_short(text)
            case 'non-strict long':
                gen_sections = non_strict_long(text)
            case _:
                # Unknown mode: fall back to a single section rather than fail.
                print('??? Unknown split method selected. Not splitting.')
                gen_sections = [text]
    else:
        # Empty text still produces one (empty) section so the pipeline runs
        # at least once and `full_generation` below is always bound.
        gen_sections = [text]
    print('Generation split into sections:', gen_sections)
    for input_text in tqdm.tqdm(gen_sections, desc='Generation section'):
        input_text = gen_prefix + input_text
        # Stage 1: text -> semantic tokens.
        semantic_tokens = text_to_semantic_new(
            input_text,
            history_prompt=history_prompt,
            temp=text_temp,
            silent=silent,
            allow_early_stop=allow_early_stop,
            min_eos_p=min_eos_p,
            progress=progress
        )
        # Stage 2: semantic -> waveform. output_full is forced True here so
        # the full generation is available for re-feeding, regardless of the
        # caller's own `output_full` flag.
        out = semantic_to_waveform_new(
            semantic_tokens,
            history_prompt=history_prompt,
            temp=waveform_temp,
            silent=silent,
            output_full=True,
            skip_fine=skip_fine,
            decode_on_cpu=decode_on_cpu,
            progress=progress
        )
        full_generation, gen_audio_new = out
        if long_gen_re_feed:
            # Use the section just generated as the voice prompt for the next
            # one: better continuity, at the risk of the voice drifting.
            history_prompt = full_generation
        # NOTE(review): the silence gap is also appended after the final
        # section, so the output ends with `long_gen_silence_secs` of silence.
        gen_audio += [gen_audio_new, silence.copy()]
    gen_audio = np.concatenate(gen_audio)
    if output_full:
        # `full_generation` is from the LAST section only.
        return full_generation, gen_audio
    return gen_audio
|