import re import gradio import tqdm from bark.api import * from .bark_generation import generate_text_semantic_new, generate_coarse_new, generate_fine_new, codec_decode_new, SAMPLE_RATE from typing import Union def text_to_semantic_new( text: str, history_prompt: Optional[Union[str, dict]] = None, temp: float = 0.7, silent: bool = False, allow_early_stop: bool = True, min_eos_p: float = 0.2, progress=gradio.Progress() ): """Generate semantic array from text. Args: text: text to be turned into audio history_prompt: history choice for audio cloning temp: generation temperature (1.0 more diverse, 0.0 more conservative) silent: disable progress bar allow_early_stop: (Added in new) set to False to generate until the limit min_eos_p: (Added in new) Generation stopping likelyness, Lower means more likely to stop. progress: (Added in new) Gradio progress bar. Returns: numpy semantic array to be fed into `semantic_to_waveform` """ x_semantic = generate_text_semantic_new( text, history_prompt=history_prompt, temp=temp, silent=silent, use_kv_caching=True, allow_early_stop=allow_early_stop, min_eos_p=min_eos_p, progress=progress ) return x_semantic def semantic_to_waveform_new( semantic_tokens: np.ndarray, history_prompt: Optional[Union[str, dict]] = None, temp: float = 0.7, silent: bool = False, output_full: bool = False, skip_fine: bool = False, decode_on_cpu: bool = False, progress=gradio.Progress() ): """Generate audio array from semantic input. Args: semantic_tokens: semantic token output from `text_to_semantic` history_prompt: history choice for audio cloning temp: generation temperature (1.0 more diverse, 0.0 more conservative) silent: disable progress bar output_full: return full generation to be used as a history prompt skip_fine: (Added in new) Skip converting coarse to fine decode_on_cpu: (Added in new) Move everything to cpu when decoding, useful for decoding huge audio files on medium vram progress: (Added in new) Gradio progress bar. Returns: numpy audio array at sample frequency 24khz """ coarse_tokens = generate_coarse_new( semantic_tokens, history_prompt=history_prompt, temp=temp, silent=silent, use_kv_caching=True, progress=progress ) if not skip_fine: fine_tokens = generate_fine_new( coarse_tokens, history_prompt=history_prompt, temp=0.5, progress=progress ) else: fine_tokens = coarse_tokens audio_arr = codec_decode_new(fine_tokens, decode_on_cpu) if output_full: full_generation = { "semantic_prompt": semantic_tokens, "coarse_prompt": coarse_tokens, "fine_prompt": fine_tokens, } return full_generation, audio_arr return audio_arr def strict_split(string: str, regex='([.,:;!?\\n])'): splits = re.split(regex, string) splits_out = [] last = '' for idx, split in enumerate(splits): if idx % 2 == 0: last = split else: last += split splits_out.append(last) if len(splits_out) == 0 or not splits_out[-1] == last: splits_out.append(last) return splits_out def non_strict_split(string: str): return strict_split(string, '(\\.)') def long_merge(splits: list[str]): limit = 220 # Estimated for normal speaking speed out_list = [] current_str = '' for split in splits: if len(current_str) + len(split) <= limit: current_str += split else: if current_str: out_list.append(current_str) current_str = split if current_str: out_list.append(current_str) return out_list def strict_short(string): return strict_split(string) def strict_long(string): return long_merge(strict_split(string)) def non_strict_short(string): return non_strict_split(string) def non_strict_long(string): return long_merge(non_strict_split(string)) def generate_audio_new( text: str, history_prompt: Optional[Union[str, dict]] = None, text_temp: float = 0.7, waveform_temp: float = 0.7, silent: bool = False, output_full: bool = False, skip_fine: bool = False, decode_on_cpu: bool = False, allow_early_stop: bool = True, min_eos_p: float = 0.2, long_gen_silence_secs: float = 0, long_gen_re_feed: bool = True, gen_prefix: str = '', split_type: str = 'Manual', progress=gradio.Progress() ): """Generate audio array from input text. Args: text: text to be turned into audio history_prompt: history choice for audio cloning text_temp: generation temperature (1.0 more diverse, 0.0 more conservative) waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative) silent: disable progress bar output_full: return full generation to be used as a history prompt skip_fine: (Added in new) Skip converting from coarse to fine decode_on_cpu: (Added in new) Decode on cpu allow_early_stop: (Added in new) Set to false to continue until the limit is reached min_eos_p: (Added in new) Lower values stop the generation earlier. long_gen_silence_secs: (Added in new) The amount of silence between clips for long form generations. long_gen_re_feed: (Added in new) For longer generations (\n) use the last generated chunk as the prompt for the next. Better continuation at risk of changing voice. gen_prefix: (Added in new) A prefix to add to every single generated chunk. split_type: (Added in new) The way to split the clips. progress: (Added in new) Gradio progress bar. Returns: numpy audio array at sample frequency 24khz """ if gen_prefix: gen_prefix = gen_prefix + ' ' silence = np.zeros(int(long_gen_silence_secs * SAMPLE_RATE)) gen_audio = [] if text: match split_type.casefold(): case 'manual': gen_sections = text.strip().split('\n') case 'strict short': gen_sections = strict_short(text) case 'strict long': gen_sections = strict_long(text) case 'non-strict short': gen_sections = non_strict_short(text) case 'non-strict long': gen_sections = non_strict_long(text) case _: print('??? Unknown split method selected. Not splitting.') gen_sections = [text] else: gen_sections = [text] print('Generation split into sections:', gen_sections) for input_text in tqdm.tqdm(gen_sections, desc='Generation section'): input_text = gen_prefix + input_text semantic_tokens = text_to_semantic_new( input_text, history_prompt=history_prompt, temp=text_temp, silent=silent, allow_early_stop=allow_early_stop, min_eos_p=min_eos_p, progress=progress ) out = semantic_to_waveform_new( semantic_tokens, history_prompt=history_prompt, temp=waveform_temp, silent=silent, output_full=True, skip_fine=skip_fine, decode_on_cpu=decode_on_cpu, progress=progress ) full_generation, gen_audio_new = out if long_gen_re_feed: history_prompt = full_generation gen_audio += [gen_audio_new, silence.copy()] gen_audio = np.concatenate(gen_audio) if output_full: return full_generation, gen_audio return gen_audio