In [None]:
!pip install seaborn #why didn't mamba or pip install work with this? 


In [1]:
import os
import time
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import seaborn as sns
from IPython.display import Audio
import nltk # we'll use this to split into sentences
import numpy as np

from bark_infinity.generation import (
 generate_text_semantic,
 preload_models,
 COARSE_RATE_HZ,
 SEMANTIC_RATE_HZ
)
from bark_infinity.api import semantic_to_waveform, set_seed
from bark_infinity import generate_audio, SAMPLE_RATE, save_as_prompt
from bark_infinity.generation import generate_coarse, generate_fine, generate_text_semantic, codec_decode

import numpy as np
from rich import pretty
from rich import inspect
import copy

from contextlib import contextmanager

def load_npz(filename):
 npz_data = np.load(filename)

 data_dict = {
 "semantic_prompt": npz_data["semantic_prompt"],
 "coarse_prompt": npz_data["coarse_prompt"],
 "fine_prompt": npz_data["fine_prompt"],
 }

 npz_data.close() 

 return data_dict


def resize_history_prompt(history_prompt, tokens=128, from_front=False):
 #semantic_to_coarse_ratio = 75 / 49.9
 semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ

 semantic_prompt = history_prompt["semantic_prompt"]
 coarse_prompt = history_prompt["coarse_prompt"]
 fine_prompt = history_prompt["fine_prompt"]

 new_semantic_len = min(tokens, len(semantic_prompt))
 new_coarse_len = min(int(new_semantic_len * semantic_to_coarse_ratio), coarse_prompt.shape[1])
 
 new_fine_len = new_coarse_len

 if from_front:
 new_semantic_prompt = semantic_prompt[:new_semantic_len]
 new_coarse_prompt = coarse_prompt[:, :new_coarse_len]
 new_fine_prompt = fine_prompt[:, :new_fine_len]
 else:
 new_semantic_prompt = semantic_prompt[-new_semantic_len:]
 new_coarse_prompt = coarse_prompt[:, -new_coarse_len:]
 new_fine_prompt = fine_prompt[:, -new_fine_len:]

 return {
 "semantic_prompt": new_semantic_prompt,
 "coarse_prompt": new_coarse_prompt,
 "fine_prompt": new_fine_prompt,
 }

def show_history_prompt_size(history_prompt, token_samples=3, semantic_back_n=128, text="history_prompt"):

 semantic_prompt = history_prompt["semantic_prompt"]
 coarse_prompt = history_prompt["coarse_prompt"]
 fine_prompt = history_prompt["fine_prompt"]

 # compute the ratio for coarse and fine back_n
 ratio = 75 / 49.9
 coarse_and_fine_back_n = int(semantic_back_n * ratio)

 def show_array_front_back(arr, n, back_n):
 if n > 0:
 front = arr[:n].tolist()
 back = arr[-n:].tolist()

 mid = []
 if len(arr) > back_n + token_samples:
 mid = arr[-back_n-token_samples:-back_n+token_samples].tolist()

 if mid:
 return f"{front} ... <{back_n} from end> {mid} ... {back}"
 else:
 return f"{front} ... {back}"
 else:
 return ""

 print(f"\n{text}")
 print(f" {text} semantic_prompt: {semantic_prompt.shape}")
 print(f" Tokens: {show_array_front_back(semantic_prompt, token_samples, semantic_back_n)}")
 
 print(f" {text} coarse_prompt: {coarse_prompt.shape}")
 for row in coarse_prompt:
 print(f" Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}")
 
 print(f" {text} fine_prompt: {fine_prompt.shape}")
 #for row in fine_prompt:
 # print(f" Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}")


def show_history_prompt_size(history_prompt, token_samples=3, semantic_back_n=256, text="history_prompt"):

 semantic_prompt = history_prompt["semantic_prompt"]
 coarse_prompt = history_prompt["coarse_prompt"]
 fine_prompt = history_prompt["fine_prompt"]

 # compute the ratio for coarse and fine back_n
 ratio = 75 / 49.9
 coarse_and_fine_back_n = int(semantic_back_n * ratio)

 def show_array_front_back(arr, n, back_n):
 if n > 0:
 front = arr[:n].tolist()
 back = arr[-n:].tolist()

 mid_front = []
 mid_back = []
 if len(arr) > back_n + token_samples:
 mid_front = arr[-back_n-token_samples:-back_n].tolist()
 mid_back = arr[-back_n:-back_n+token_samples].tolist()

 if mid_front and mid_back:
 return f"{front} ... {mid_front} <{back_n} from end> {mid_back} ... {back}"
 else:
 return f"{front} ... {back}"
 else:
 return ""

 print(f"\n{text}")
 print(f" {text} semantic_prompt: {semantic_prompt.shape}")
 print(f" Tokens: {show_array_front_back(semantic_prompt, token_samples, semantic_back_n)}")
 
 print(f" {text} coarse_prompt: {coarse_prompt.shape}")
 for row in coarse_prompt:
 print(f" Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}")
 
 print(f" {text} fine_prompt: {fine_prompt.shape}")
 #for row in fine_prompt:
 # print(f" Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}")


def split_array_equally(array, num_parts):
 split_indices = np.linspace(0, len(array), num_parts + 1, dtype=int)
 return [array[split_indices[i]: split_indices[i + 1]].astype(np.int32) for i in range(num_parts)]




@contextmanager
def measure_time(text=None, index=None):
 start_time = time.time()
 yield
 elapsed_time = time.time() - start_time
 if index is not None and text is not None:
 text = f"{text} {index}"
 elif text is None:
 text = "Operation"
 
 time_finished = f"{text} Finished at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}"
 print(f" -->{time_finished} in {elapsed_time} seconds")



def compare_history_prompts(hp1, hp2, text="history_prompt"):
 print(f"\nComparing {text}")
 for key in hp1.keys():
 # Compare only the parts of the arrays that have the same shape
 if hp1[key].shape != hp2[key].shape:
 print(f" {key} arrays have different shapes: {hp1[key].shape} vs {hp2[key].shape}.")
 min_size = min(hp1[key].shape[0], hp2[key].shape[0])

 if hp1[key].ndim == 1:
 hp1_part = hp1[key][-min_size:]
 hp2_part = hp2[key][-min_size:]
 else:
 min_size = min(hp1[key].shape[1], hp2[key].shape[1])
 hp1_part = hp1[key][:, -min_size:]
 hp2_part = hp2[key][:, -min_size:]
 
 print(f" Comparing the last {min_size} elements of each.")
 else:
 hp1_part = hp1[key]
 hp2_part = hp2[key]

 if np.array_equal(hp1_part, hp2_part):
 print(f" {key} arrays are exactly the same.")
 elif np.allclose(hp1_part, hp2_part):
 diff = np.linalg.norm(hp1_part - hp2_part)
 print(f" {key} arrays are almost equal with a norm of difference: {diff}")
 else:
 diff = np.linalg.norm(hp1_part - hp2_part)
 print(f" {key} arrays are not equal. Norm of difference: {diff}")


def split_by_words(text, word_group_size):
 words = text.split()
 result = []
 group = ""
 
 for i, word in enumerate(words):
 group += word + " "
 
 if (i + 1) % word_group_size == 0:
 result.append(group.strip())
 group = ""
 
 # Add the last group if it's not empty
 if group.strip():
 result.append(group.strip())
 
 return result

def concat_history_prompts(history_prompt1, history_prompt2):
 new_semantic_prompt = np.hstack([history_prompt1["semantic_prompt"], history_prompt2["semantic_prompt"]]).astype(np.int32) #not int64?
 new_coarse_prompt = np.hstack([history_prompt1["coarse_prompt"], history_prompt2["coarse_prompt"]]).astype(np.int32)
 new_fine_prompt = np.hstack([history_prompt1["fine_prompt"], history_prompt2["fine_prompt"]]).astype(np.int32)

 concatenated_history_prompt = {
 "semantic_prompt": new_semantic_prompt,
 "coarse_prompt": new_coarse_prompt,
 "fine_prompt": new_fine_prompt,
 }

 return concatenated_history_prompt

# this shoudl be equal because the rows are always the same, I think?
def align_and_concat_history_prompts(history_prompt1, history_prompt2):
 # Determine the size along the time dimension for each array in the history prompts
 semantic_time_size = min(history_prompt1["semantic_prompt"].shape[0], history_prompt2["semantic_prompt"].shape[0])
 coarse_time_size = min(history_prompt1["coarse_prompt"].shape[1], history_prompt2["coarse_prompt"].shape[1])
 fine_time_size = min(history_prompt1["fine_prompt"].shape[1], history_prompt2["fine_prompt"].shape[1])

 # Align arrays along the time dimension
 semantic_prompt1 = history_prompt1["semantic_prompt"][-semantic_time_size:]
 semantic_prompt2 = history_prompt2["semantic_prompt"][-semantic_time_size:]
 coarse_prompt1 = history_prompt1["coarse_prompt"][:, -coarse_time_size:]
 coarse_prompt2 = history_prompt2["coarse_prompt"][:, -coarse_time_size:]
 fine_prompt1 = history_prompt1["fine_prompt"][:, -fine_time_size:]
 fine_prompt2 = history_prompt2["fine_prompt"][:, -fine_time_size:]

 # Concatenate each array
 new_semantic_prompt = np.hstack([semantic_prompt1, semantic_prompt2]).astype(np.int32)
 new_coarse_prompt = np.hstack([coarse_prompt1, coarse_prompt2]).astype(np.int32)
 new_fine_prompt = np.hstack([fine_prompt1, fine_prompt2]).astype(np.int32)

 # Create a new history_prompt with concatenated arrays
 concatenated_history_prompt = {
 "semantic_prompt": new_semantic_prompt,
 "coarse_prompt": new_coarse_prompt,
 "fine_prompt": new_fine_prompt,
 }

 return concatenated_history_prompt


def merge_history_prompts(left_history_prompt, right_history_prompt, right_size = 128):
 right_history_prompt = resize_history_prompt(right_history_prompt, tokens=right_size, from_front=False)
 combined_history_prompts = concat_history_prompts(left_history_prompt, right_history_prompt)
 combined_history_prompts = resize_history_prompt(combined_history_prompts, tokens=341, from_front=False)
 return combined_history_prompts


preload_models(text_use_small=False,coarse_use_small=False, fine_use_small=False)

In [54]:
# Or Small with fine large
preload_models(text_use_small=True,coarse_use_small=True, fine_use_small=False, force_reload=True)

In [43]:
charlie_text="""
Have I told you that story about how Charlie Parker became Charlie Parker?
Parker's a young kid, pretty good on the Sax, 
gets up to play at a cutting session, 
and well, he fucks it up. 
""".replace("\n", " ").strip()


sentence_text = "A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools."
sentence_text = "In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move."


testing_seed = 12345
testing_seed = -1
testing_SPEAKER = "en_fiery.npz"

# Simple Long-Form Generation
We split longer text into sentences using `nltk` and generate the sentences one by one.

In [83]:
SPEAKER = testing_SPEAKER

other_SPEAKER = "custom_speakers/classic_robot_tts.npz"

og_history_prompt = load_npz(SPEAKER)
other_history_prompt = load_npz(other_SPEAKER)

previous_segment_token_size = 128

show_history_prompt_size(og_history_prompt, token_samples=3, text=f"{SPEAKER} Original", semantic_back_n=previous_segment_token_size)
show_history_prompt_size(other_history_prompt, token_samples=3, text=f"{other_SPEAKER} Other", semantic_back_n=previous_segment_token_size)

#resized_history_prompt = resize_history_prompt(og_history_prompt, tokens=341, from_front=False)
#show_history_prompt_size(resized_history_prompt, text="Resized")
#compare_history_prompts(og_history_prompt, resized_history_prompt)

#compare_history_prompts(other_history_prompt, og_history_prompt)


#align_and_concat = align_and_concat_history_prompts(og_history_prompt, other_history_prompt)
#concat = concat_history_prompts(og_history_prompt, other_history_prompt)

#show_history_prompt_size(align_and_concat, text="align and concat")
#show_history_prompt_size(concat, text="concat")



#other_history_prompt_resize = resize_history_prompt(other_history_prompt, tokens=previous_segment_token_size, from_front=False)
#og_history_prompt_trimmed = resize_history_prompt(og_history_prompt, tokens=341, from_front=False)

#new_speaker_blend = align_and_concat_history_prompts(og_history_prompt_trimmed, other_history_prompt_resize)

other_history_prompt = resize_history_prompt(other_history_prompt, tokens=64, from_front=False)

show_history_prompt_size(other_history_prompt, token_samples=3, text=f"{other_SPEAKER} Other resize check", semantic_back_n=64)

speaker_blend = merge_history_prompts(og_history_prompt, other_history_prompt, right_size=previous_segment_token_size)

show_history_prompt_size(speaker_blend, token_samples=3, text=f"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}", semantic_back_n=64)

#show_history_prompt_size(new_speaker_blend, text="align and concat")
#show_history_prompt_size(new_speaker_blend_2, text="concat")
#compare_history_prompts(new_speaker_blend, new_speaker_blend_2, text="compare concat methods")
# we have 256 tokens for semantic and less even for coarse, unless we figure out how to pack the inference space 
# <og_history_prompt, cropped to 341> + <previous_segment_token_size>


en_fiery.npz Original
 en_fiery.npz Original semantic_prompt: (682,)
 Tokens: [147, 6242, 302] ... [6747, 187, 891] <128 from end> [891, 891, 7100] ... [2403, 147, 2009]
 en_fiery.npz Original coarse_prompt: (2, 1025)
 Tokens: [738, 738, 1017] ... [683, 402, 162] <192 from end> [695, 501, 240] ... [717, 121, 121]
 Tokens: [363, 363, 646] ... [761, 53, 809] <192 from end> [831, 345, 559] ... [424, 424, 424]
 en_fiery.npz Original fine_prompt: (8, 1025)

custom_speakers/classic_robot_tts.npz Other
 custom_speakers/classic_robot_tts.npz Other semantic_prompt: (457,)
 Tokens: [10, 10, 1184] ... [429, 41, 38] <128 from end> [3277, 3554, 7822] ... [206, 206, 186]
 custom_speakers/classic_robot_tts.npz Other coarse_prompt: (2, 686)
 Tokens: [699, 699, 753] ... [534, 186, 656] <192 from end> [451, 754, 421] ... [133, 133, 106]
 Tokens: [1002, 1002, 404] ... [16, 846, 890] <192 from end> [478, 345, 276] ... [913, 913, 913]
 custom_speakers/classic_robot_tts.npz Other fine_prompt: (8, 686)

cus

In [76]:
# Speaker Mixing

SPEAKER = testing_SPEAKER

other_SPEAKER = "custom_speakers/classic_robot_tts.npz"

og_history_prompt = load_npz(SPEAKER)
other_history_prompt = load_npz(other_SPEAKER)




show_history_prompt_size(og_history_prompt, text=f"f{SPEAKER} Original")
show_history_prompt_size(other_history_prompt, text=f"{other_SPEAKER} Other")

cell_text_prompt = charlie_text

previous_segment_token_size = 128





speaker_blend = merge_history_prompts(og_history_prompt, other_history_prompt, right_size=previous_segment_token_size)

show_history_prompt_size(speaker_blend, text=f"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}")

"""
other_history_prompt_resize = resize_history_prompt(other_history_prompt, tokens=previous_segment_token_size, from_front=False)
og_history_prompt_trimmed = resize_history_prompt(og_history_prompt, tokens=341, from_front=False)


speaker_blend = concat_history_prompts(og_history_prompt_trimmed, other_history_prompt_resize)
show_history_prompt_size(speaker_blend, text=f"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}")
"""

all_history_prompts = []

all_history_prompts.append([og_history_prompt, f"{SPEAKER} Original"])
all_history_prompts.append([other_history_prompt, f"{other_SPEAKER } Other"])
all_history_prompts.append([speaker_blend, f"Orig {SPEAKER} + {previous_segment_token_size} tokens from {other_SPEAKER}"])


final_audio_clips = []


for history_prompt, text in all_history_prompts:
 print(f"\n-->Generating for {text}")

 pieces = []

 show_history_prompt_size(history_prompt, token_samples=3, text=text)

 with measure_time(text=" Regular Generation"):

 set_seed(testing_seed)
 full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=history_prompt, output_full=True, silent=True)
 pieces += [audio_array]
 
 show_history_prompt_size(full_generation, text=f" {text} full_generation output")

 final_audio_clips.append(pieces)



"""
show_history_prompt_size(speaker_blend, token_samples=3, text=f"{other_SPEAKER}")

pieces = []

show_history_prompt_size(og_history_prompt, token_samples=3, text="Other history_prompt file")




sure_time(text="Regular Other"):

 set_seed(testing_seed)
 og_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=og_history_prompt, output_full=True)
 pieces += [audio_array]
 
show_history_prompt_size(og_full_generation, text="Regular Other")

final_audio_clips.append(pieces)



show_history_prompt_size(speaker_blend, token_samples=3, text=f"speaker_blend")

pieces = []

with measure_time(text=f"new_speaker_blend"):
 set_seed(testing_seed)
 new_speaker_blend_output, audio_array = generate_audio(cell_text_prompt, history_prompt=speaker_blend, output_full=True)
 pieces += [audio_array]
 
show_history_prompt_size(new_speaker_blend_output, text=f"speaker_blend Output")

final_audio_clips.append(pieces)


#compare_history_prompts(og_full_generation, new_speaker_blend_2)
"""
 







fen_fiery.npz Original
 fen_fiery.npz Original semantic_prompt: (682,)
 Tokens: [147, 6242, 302] ... [10, 230, 56] <256 from end> [206, 10, 206] ... [2403, 147, 2009]
 fen_fiery.npz Original coarse_prompt: (2, 1025)
 Tokens: [738, 738, 1017] ... [738, 738, 738] <384 from end> [738, 738, 738] ... [717, 121, 121]
 Tokens: [363, 363, 646] ... [937, 544, 937] <384 from end> [544, 544, 544] ... [424, 424, 424]
 fen_fiery.npz Original fine_prompt: (8, 1025)

custom_speakers/classic_robot_tts.npz Other
 custom_speakers/classic_robot_tts.npz Other semantic_prompt: (457,)
 Tokens: [10, 10, 1184] ... [41, 41, 2362] <256 from end> [2362, 8414, 7892] ... [206, 206, 186]
 custom_speakers/classic_robot_tts.npz Other coarse_prompt: (2, 686)
 Tokens: [699, 699, 753] ... [118, 937, 51] <384 from end> [378, 820, 937] ... [133, 133, 106]
 Tokens: [1002, 1002, 404] ... [584, 406, 457] <384 from end> [850, 60, 588] ... [913, 913, 913]
 custom_speakers/classic_robot_tts.npz Other fine_prompt: (8, 686)

Bas

'\nshow_history_prompt_size(speaker_blend, token_samples=3, text=f"{other_SPEAKER}")\n\npieces = []\n\nshow_history_prompt_size(og_history_prompt, token_samples=3, text="Other history_prompt file")\n\nwith measure_time(text="Regular Other"):\n\n set_seed(testing_seed)\n og_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=og_history_prompt, output_full=True)\n pieces += [audio_array]\n \nshow_history_prompt_size(og_full_generation, text="Regular Other")\n\nfinal_audio_clips.append(pieces)\n\n\n\nshow_history_prompt_size(speaker_blend, token_samples=3, text=f"speaker_blend")\n\npieces = []\n\nwith measure_time(text=f"new_speaker_blend"):\n set_seed(testing_seed)\n new_speaker_blend_output, audio_array = generate_audio(cell_text_prompt, history_prompt=speaker_blend, output_full=True)\n pieces += [audio_array]\n \nshow_history_prompt_size(new_speaker_blend_output, text=f"speaker_blend Output")\n\nfinal_audio_clips.append(pieces)\n\n\n#compare_history_prompts(o

In [9]:
# Speaker Segmenting

npz_directory = "atten/"

print(f"Rendering samples for speakers in: {npz_directory}")
npz_files = [f for f in os.listdir(npz_directory) if f.endswith(".npz")]



new_directory = os.path.join(npz_directory, "vars")
# make new directory
if not os.path.exists(new_directory):
 os.makedirs(new_directory)
 
for i, npz_file in enumerate(npz_files):

 npz_filepath = os.path.join(npz_directory, npz_file)


 
 start_size = 128
 increment_size = 64

 print(f"Loading {npz_filepath}")
 history_prompt = load_npz(npz_filepath)

 semantic_prompt_max = history_prompt["semantic_prompt"].shape[0]
 print(f"semantic_prompt_max: {semantic_prompt_max}")
 
 show_history_prompt_size(history_prompt, token_samples=3, text=f"{npz_file} original")
 # Increase start_size by increment_size until we reach semantic_prompt_max, save each file
 while start_size <= semantic_prompt_max - increment_size:




 # from the front
 new_history_prompt = resize_history_prompt(history_prompt, tokens=start_size, from_front=True)
 show_history_prompt_size(new_history_prompt, token_samples=3, text=f"{npz_file} resized from front")

 
 new_filename = f"{npz_file[:-4]}_front_{start_size}.npz"
 new_filepath = os.path.join(new_directory, new_filename)
 

 print(f"Saving {new_filepath}")
 
 new_history_prompt = resize_history_prompt(new_history_prompt, tokens=341, from_front=False)
 save_as_prompt(new_filepath,new_history_prompt)

 # from the back
 new_history_prompt = resize_history_prompt(history_prompt, tokens=start_size, from_front=False)
 show_history_prompt_size(new_history_prompt, token_samples=3, text=f"{npz_file} resized from back")
 new_filename = f"{npz_file[:-4]}_back_{start_size}.npz"
 
 
 new_filepath = os.path.join(new_directory, new_filename)
 print(f"Saving {new_filepath}")
 new_history_prompt = resize_history_prompt(new_history_prompt, tokens=341, from_front=False)
 save_as_prompt(new_filepath,new_history_prompt)

 start_size += increment_size



 



Rendering samples for speakers in: needs_fixing/
Loading needs_fixing/Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz
semantic_prompt_max: 256

Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz original
 Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz original semantic_prompt: (256,)
 Tokens: [1866, 1424, 1424] ... [648, 198, 41]
 Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz original coarse_prompt: (2, 384)
 Tokens: [679, 747, 11] ... [347, 976, 865]
 Tokens: [712, 317, 368] ... [839, 812, 544]
 Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz original fine_prompt: (8, 384)

Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from front
 Hey_have_you_he-23-0512-01

In [2]:
# Trim files

npz_directory = "sendit/"

print(f"Rendering samples for speakers in: {npz_directory}")
npz_files = [f for f in os.listdir(npz_directory) if f.endswith(".npz")]



new_directory = os.path.join(npz_directory, "trimmed")
# make new directory
if not os.path.exists(new_directory):
 os.makedirs(new_directory)
 
for i, npz_file in enumerate(npz_files):
 
 npz_filepath = os.path.join(npz_directory, npz_file)




 print(f"Loading {npz_filepath}")
 history_prompt = load_npz(npz_filepath)

 new_filename = f"french_female_{i}_trimmed.npz"
 new_filepath = os.path.join(new_directory, new_filename)
 print(f"Saving {new_filepath}")
 history_prompt = resize_history_prompt(history_prompt, tokens=341, from_front=False)
 save_as_prompt(new_filepath,history_prompt)




 



Rendering samples for speakers in: sendit/
Loading sendit/female_reader_neutral_2.npz
Saving sendit/trimmed/french_female_0_trimmed.npz
Loading sendit/french_female_4b.npz
Saving sendit/trimmed/french_female_1_trimmed.npz
Loading sendit/034_fr_dialog.mp4.npz
Saving sendit/trimmed/french_female_2_trimmed.npz
Loading sendit/081_bark_fr_woman_chanson.mp4.npz
Saving sendit/trimmed/french_female_3_trimmed.npz
Loading sendit/french_female_1.npz
Saving sendit/trimmed/french_female_4_trimmed.npz
Loading sendit/062_bark_fr_woman_chanson.mp4.npz
Saving sendit/trimmed/french_female_5_trimmed.npz
Loading sendit/063_bark_fr_woman_chanson.mp4.npz
Saving sendit/trimmed/french_female_6_trimmed.npz
Loading sendit/female_neutral_reader_1.npz
Saving sendit/trimmed/french_female_7_trimmed.npz
Loading sendit/080_bark_fr_woman_chanson.mp4.npz
Saving sendit/trimmed/french_female_8_trimmed.npz
Loading sendit/female_french.npz
Saving sendit/trimmed/french_female_9_trimmed.npz
Loading sendit/french_female_3a.np

In [None]:
# Speaker Segmenting

npz_directory = "Trump/"

print(f"Rendering samples for speakers in: {npz_directory}")
npz_files = [f for f in os.listdir(npz_directory) if f.endswith(".npz")]




SPEAKER = testing_SPEAKER

other_SPEAKER = "custom_speakers/classic_robot_tts.npz"

og_history_prompt = load_npz(SPEAKER)
other_history_prompt = load_npz(other_SPEAKER)




show_history_prompt_size(og_history_prompt, text=f"f{SPEAKER} Original")
show_history_prompt_size(other_history_prompt, text=f"{other_SPEAKER} Other")

cell_text_prompt = charlie_text

previous_segment_token_size = 128





speaker_blend = merge_history_prompts(og_history_prompt, other_history_prompt, right_size=previous_segment_token_size)

show_history_prompt_size(speaker_blend, text=f"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}")

"""
other_history_prompt_resize = resize_history_prompt(other_history_prompt, tokens=previous_segment_token_size, from_front=False)
og_history_prompt_trimmed = resize_history_prompt(og_history_prompt, tokens=341, from_front=False)


speaker_blend = concat_history_prompts(og_history_prompt_trimmed, other_history_prompt_resize)
show_history_prompt_size(speaker_blend, text=f"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}")
"""

all_history_prompts = []

all_history_prompts.append([og_history_prompt, f"{SPEAKER} Original"])
all_history_prompts.append([other_history_prompt, f"{other_SPEAKER } Other"])
all_history_prompts.append([speaker_blend, f"Orig {SPEAKER} + {previous_segment_token_size} tokens from {other_SPEAKER}"])


final_audio_clips = []


for history_prompt, text in all_history_prompts:
 print(f"\n-->Generating for {text}")

 pieces = []

 show_history_prompt_size(history_prompt, token_samples=3, text=text)

 with measure_time(text=" Regular Generation"):

 set_seed(testing_seed)
 full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=history_prompt, output_full=True, silent=True)
 pieces += [audio_array]
 
 show_history_prompt_size(full_generation, text=f" {text} full_generation output")

 final_audio_clips.append(pieces)



"""
show_history_prompt_size(speaker_blend, token_samples=3, text=f"{other_SPEAKER}")

pieces = []

show_history_prompt_size(og_history_prompt, token_samples=3, text="Other history_prompt file")




sure_time(text="Regular Other"):

 set_seed(testing_seed)
 og_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=og_history_prompt, output_full=True)
 pieces += [audio_array]
 
show_history_prompt_size(og_full_generation, text="Regular Other")

final_audio_clips.append(pieces)



show_history_prompt_size(speaker_blend, token_samples=3, text=f"speaker_blend")

pieces = []

with measure_time(text=f"new_speaker_blend"):
 set_seed(testing_seed)
 new_speaker_blend_output, audio_array = generate_audio(cell_text_prompt, history_prompt=speaker_blend, output_full=True)
 pieces += [audio_array]
 
show_history_prompt_size(new_speaker_blend_output, text=f"speaker_blend Output")

final_audio_clips.append(pieces)


#compare_history_prompts(og_full_generation, new_speaker_blend_2)
"""
 






In [None]:
# Let's check 256 semantic tokens to exact match the original history prompt, and 341 to match the fine prompt too.
# And the coarse uses only like 209 of the semantic. So low! I hope we can pack the inference token space with more history!

SPEAKER = testing_SPEAKER
cell_text_prompt = charlie_text




og_history_prompt = load_npz(SPEAKER)

show_history_prompt_size(og_history_prompt, token_samples=3, text="Original history_prompt file")


final_audio_clips = []


og_full_generation = None
pieces = []


with measure_time(text="Regular Generation"):

 set_seed(testing_seed)
 og_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=og_history_prompt, output_full=True)
 pieces += [audio_array]
 
show_history_prompt_size(og_full_generation, text="Regular Output")

final_audio_clips.append(pieces)



resized_to = 256 # this will give identical sem and coarse. 
resized_to = 341 # this will give identical fine too

resized_history_prompt = resize_history_prompt(og_history_prompt, tokens=resized_to, from_front=False)
show_history_prompt_size(resized_history_prompt, token_samples=3, text=f"Resized to {resized_to}")

pieces = []

with measure_time(text=f"Resized to {resized_to}"):
 set_seed(testing_seed)
 resized_256_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=resized_history_prompt, output_full=True)
 pieces += [audio_array]
 
show_history_prompt_size(resized_256_full_generation, text=f"Output after resize: {resized_to}")

final_audio_clips.append(pieces)


compare_history_prompts(og_full_generation, resized_256_full_generation)

 

resized_to = 255
resized_to = 340 
resized_history_prompt_too_small = resize_history_prompt(og_history_prompt, tokens=resized_to, from_front=False)
show_history_prompt_size(resized_history_prompt_too_small, token_samples=3, text=f"Resized to {resized_to}")

pieces = []
with measure_time(text=f"Resized to {resized_to}"):
 set_seed(testing_seed)
 resized_too_small_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=resized_history_prompt_too_small, output_full=True)
 pieces += [audio_array]
 
show_history_prompt_size(resized_too_small_full_generation, text=f"Output after {resized_to}")

compare_history_prompts(og_full_generation, resized_too_small_full_generation)


final_audio_clips.append(pieces)







In [77]:
Audio(np.concatenate(final_audio_clips[0]), rate=SAMPLE_RATE)
# original

In [78]:
Audio(np.concatenate(final_audio_clips[1]), rate=SAMPLE_RATE)
# 256

In [79]:
Audio(np.concatenate(final_audio_clips[2]), rate=SAMPLE_RATE)
# 255

# $ \\ $

# Advanced Long-Form Generation
Somtimes Bark will hallucinate a little extra audio at the end of the prompt.
We can solve this issue by lowering the threshold for bark to stop generating text. 
We use the `min_eos_p` kwarg in `generate_text_semantic`

In [60]:
# unmodified naive chunking code. Just generate small audio fragment as clips, just like the original version did with sentences.

# result: terrible.
# if you ask generate_text_semantic just to generate 3 words, it's still assuming that's a normal sized audio clip
# it sounds like a 3 word complete spoken utterance, not a part of a sentence.

# REMINDER TO TEST LATER: maybe we can preload generate_text_semantic with already inferenced tokens, using the tokens in that space, instead of puttitng them in the history_prompt
# if we do this will the words follow naturally, and the result match an inference where we given the whole sentence at once?

GEN_TEMP = 0.6


SPEAKER = testing_SPEAKER
cell_text_prompt = charlie_text

silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence

set_seed(testing_seed)

pieces = split_by_words(cell_text_prompt, 3)
print(pieces)
final_pieces = [] 

for i, piece in enumerate(pieces):
 with measure_time(text="Piece", index=i):
 print(piece)
 semantic_tokens = generate_text_semantic(
 piece,
 history_prompt=SPEAKER,
 temp=GEN_TEMP,
 min_eos_p=0.05, # this controls how likely the generation is to end
 )

 audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER)
 final_pieces.append(audio_array)

Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)

Enabling deterministic algorithms
Set seed to 1234
['Have I told', 'you that story', 'about how Charlie', 'Parker became Charlie', "Parker? Parker's a", 'young kid, pretty', 'good on the', 'Sax, gets up', 'to play at', 'a cutting session,', 'and well, he', 'fucks it up.']
Have I told


100%|██████████| 100/100 [00:00<00:00, 193.68it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 2/2 [00:01<00:00, 1.74it/s]


 -->Piece 0 Finished at: 2023-05-09 18:04:15 in 3.2909646034240723 seconds
you that story


100%|██████████| 100/100 [00:00<00:00, 187.81it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.95it/s]


 -->Piece 1 Finished at: 2023-05-09 18:04:19 in 3.8052828311920166 seconds
about how Charlie


100%|██████████| 100/100 [00:00<00:00, 197.72it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 2/2 [00:01<00:00, 1.43it/s]


 -->Piece 2 Finished at: 2023-05-09 18:04:22 in 3.587956666946411 seconds
Parker became Charlie


100%|██████████| 100/100 [00:00<00:00, 106.82it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 5/5 [00:02<00:00, 1.87it/s]


 -->Piece 3 Finished at: 2023-05-09 18:04:28 in 5.316859245300293 seconds
Parker? Parker's a


100%|██████████| 100/100 [00:00<00:00, 101.79it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 5/5 [00:02<00:00, 1.74it/s]


 -->Piece 4 Finished at: 2023-05-09 18:04:33 in 5.499145269393921 seconds
young kid, pretty


100%|██████████| 100/100 [00:01<00:00, 97.89it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 5/5 [00:03<00:00, 1.66it/s]


 -->Piece 5 Finished at: 2023-05-09 18:04:39 in 5.751179456710815 seconds
good on the


100%|██████████| 100/100 [00:01<00:00, 70.66it/s] 


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 7/7 [00:04<00:00, 1.69it/s]


 -->Piece 6 Finished at: 2023-05-09 18:04:46 in 7.167700290679932 seconds
Sax, gets up


100%|██████████| 100/100 [00:01<00:00, 66.56it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 8/8 [00:04<00:00, 1.78it/s]


 -->Piece 7 Finished at: 2023-05-09 18:04:54 in 7.699352979660034 seconds
to play at


100%|██████████| 100/100 [00:00<00:00, 185.02it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.94it/s]


 -->Piece 8 Finished at: 2023-05-09 18:04:57 in 3.6623594760894775 seconds
a cutting session,


100%|██████████| 100/100 [00:00<00:00, 124.79it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 4/4 [00:02<00:00, 1.69it/s]


 -->Piece 9 Finished at: 2023-05-09 18:05:02 in 4.857580661773682 seconds
and well, he


100%|██████████| 100/100 [00:00<00:00, 185.49it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.87it/s]


 -->Piece 10 Finished at: 2023-05-09 18:05:06 in 3.7911927700042725 seconds
fucks it up.


100%|██████████| 100/100 [00:01<00:00, 68.06it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 7/7 [00:04<00:00, 1.58it/s]


 -->Piece 11 Finished at: 2023-05-09 18:05:14 in 7.664571523666382 seconds


In [64]:
# Test: First generate all semantic tokens in one go. Then iteratively chop up the semantic tokens into pieces and feed to the coarse and fine models tiny chunks.

# Result. Kind of okay with large models. 
# Small models ok too except the small fine model, which clips like crazy. 
# You an still kind of feel the structure of the 3 word phrases, even though we generated semantic all at ounce. Though I'm not sure.
# Update nah, it's good, I'm just hearing the coarse and fine history prompt changes.
# There is still some clipping. We're splitting on random numbers, we could instead backtrack and erase spaces, they seem easy to recognize, or try to split on actual pauses or silence, rather than between words.

# The other big flaw is we didn't both updating the history prompt for coarse and semantic for each chunk. Let's try that next.

GEN_TEMP = 0.6
SPEAKER = testing_SPEAKER

silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence

set_seed(testing_seed)

cell_text_prompt = charlie_text

number_of_semantic_pieces = 12

final_pieces = [] 

full_text = charlie_text
print(full_text)

semantic_tokens = []

with measure_time(text="Semantic"):
 semantic_tokens = generate_text_semantic(
 full_text,
 history_prompt=SPEAKER,
 temp=GEN_TEMP,
 min_eos_p=0.05,
 silent=True)



print (f" full len: {len(semantic_tokens)}")

split_semantic_tokens = split_array_equally(semantic_tokens, number_of_semantic_pieces)

with measure_time(text="Coarse Full"):
 for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):
 print(f"length of coarse_semantic_tokens {i + 1}: {len(coarse_semantic_tokens)}")
 audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=SPEAKER, silent=True)
 final_pieces += [audio_array]






Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)

Enabling deterministic algorithms
Set seed to 1234
Have I told you that story about how Charlie Parker became Charlie Parker? Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it up.
 -->Semantic Finished at: 2023-05-09 18:13:41 in 6.372102499008179 seconds
 full len: 591
length of coarse_semantic_tokens 1: 49
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 2: 49
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 3: 49
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 4: 50
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 5: 49
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 6: 49
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 

In [65]:
# let's double check the 3 word structure phrasing is ACTUALLY result of the coarse model, by using the raw semantic tokens that we generated in the first cell.
# Just want to double check
# Edit: it wasn't the coarse model. 


show_history_prompt_size(og_full_generation, text="base semantic output")

semantic_tokens = og_full_generation["semantic_prompt"]

set_seed(testing_seed)

final_pieces = []

split_semantic_tokens = split_array_equally(semantic_tokens, 8)


with measure_time(text="Coarse Full"):
 for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):
 print(f"length of coarse_semantic_tokens {i + 1}: {len(coarse_semantic_tokens)}")
 audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=SPEAKER, silent=True)
 final_pieces += [audio_array]


Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)


base semantic output semantic_prompt: (548,)
 Tokens: [8735, 8385, 147] ... [232, 232, 10]

base semantic output coarse_prompt: (2, 823)
 Tokens: [62, 62, 62] ... [855, 855, 855]
 Tokens: [424, 424, 424] ... [928, 913, 913]

base semantic output fine_prompt: (8, 823)
Enabling deterministic algorithms
Set seed to 1234
length of coarse_semantic_tokens 1: 68
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 2: 69
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 3: 68
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 4: 69
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 5: 68
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
length of coarse_semantic_tokens 6: 69
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626
le

In [80]:
# Test: Generate all semantic in one go. Then iteratively chop up the semantic tokens into tiny pieces and feed to the coarse and fine models.
# But this time properly also update the history prompt with the coarse and fine tokens with prev segments

# this is nearly perfect, down to some pretty small semantic chunks, down as small as almost 50 coarse tokens nearly, and 80+ is usually mostly seamless with a few minor artifacts
# I wonder if the time alignment hack is breaking the smaller chunks because 1 second chunks work so well, but not smaller?

# Update: constantly updating pushes origina speaker out of the window too fast, and the short segments drift about as much as a full 14 second segment, so it loses the speaker voice too vase. Next cell, fix that.

SPEAKER = testing_SPEAKER
cell_text_prompt = charlie_text
set_seed(testing_seed)


GEN_TEMP = 0.6


final_pieces = [] 

full_text = cell_text_prompt

print(full_text)


next_history_prompt_for_coarse = load_npz(SPEAKER)

show_history_prompt_size(next_history_prompt_for_coarse,text="original history_prompt")

semantic_tokens_to_process = generate_text_semantic(
 full_text,
 history_prompt=SPEAKER,
 temp=GEN_TEMP,
 min_eos_p=0.05,
 silent=True)


split_semantic_tokens = split_array_equally(semantic_tokens_to_process, 12)

for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):
 print(f"processing semantic_tokens chunk {i + 1} of size: {len(coarse_semantic_tokens)}")

 
 # first time the history prompt is same as regular speaker file
 full_generation, audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=next_history_prompt_for_coarse, output_full=True)

 coarse_tokens = generate_coarse(
 coarse_semantic_tokens,
 history_prompt=next_history_prompt_for_coarse,
 temp=0.7,
 silent=True,
 use_kv_caching=True,
 x_coarse_history_alignment_hack=-2
 )
 fine_tokens = generate_fine(
 coarse_tokens,
 history_prompt=next_history_prompt_for_coarse,
 temp=0.5,
 )
 audio_array = codec_decode(fine_tokens)

 full_generation = {
 "semantic_prompt": coarse_semantic_tokens,
 "coarse_prompt": coarse_tokens,
 "fine_prompt": fine_tokens,
 }



 show_history_prompt_size(full_generation, text="full generation returned") 


 #stack history with last generated
 next_semantic_tokens = np.hstack([next_history_prompt_for_coarse["semantic_prompt"], full_generation["semantic_prompt"]]).astype(np.int32) #should this be int64?

 next_coarse_tokens = np.hstack([next_history_prompt_for_coarse["coarse_prompt"], full_generation["coarse_prompt"]]).astype(np.int32)

 next_fine_tokens = np.hstack([next_history_prompt_for_coarse["fine_prompt"], full_generation["fine_prompt"]]).astype(np.int32)

 next_history_prompt_for_coarse = {
 "semantic_prompt": next_semantic_tokens,
 "coarse_prompt": next_coarse_tokens,
 "fine_prompt": next_fine_tokens,
 }

 show_history_prompt_size(next_history_prompt_for_coarse, text="next history prompt for coarse")

 final_pieces.append(audio_array)


Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)

Disabling deterministic algorithms
Set seed to 3004692535
Have I told you that story about how Charlie Parker became Charlie Parker? Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it up.

original history_prompt
 original history_prompt semantic_prompt: (682,)
 Tokens: [147, 6242, 302] ... [10, 230, 56] <256 from end> [206, 10, 206] ... [2403, 147, 2009]
 original history_prompt coarse_prompt: (2, 1025)
 Tokens: [738, 738, 1017] ... [738, 738, 738] <384 from end> [738, 738, 738] ... [717, 121, 121]
 Tokens: [363, 363, 646] ... [937, 544, 937] <384 from end> [544, 544, 544] ... [424, 424, 424]
 original history_prompt fine_prompt: (8, 1025)
processing semantic_tokens chunk 1 of size: 53
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.86it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (53,)
 Tokens: [2305, 147, 3208] ... [720, 1409, 1409]
 full generation returned coarse_prompt: (2, 79)
 Tokens: [62, 62, 62] ... [936, 958, 505]
 Tokens: [424, 424, 424] ... [632, 654, 140]
 full generation returned fine_prompt: (8, 79)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (735,)
 Tokens: [147, 6242, 302] ... [6025, 6025, 6564] <256 from end> [648, 41, 6286] ... [720, 1409, 1409]
 next history prompt for coarse coarse_prompt: (2, 1104)
 Tokens: [738, 738, 1017] ... [30, 370, 860] <384 from end> [208, 495, 20] ... [936, 958, 505]
 Tokens: [363, 363, 646] ... [701, 279, 719] <384 from end> [416, 673, 568] ... [632, 654, 140]
 next history prompt for coarse fine_prompt: (8, 1104)
processing semantic_tokens chunk 2 of size: 54
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.76it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (54,)
 Tokens: [3732, 6358, 808] ... [2069, 9848, 1044]
 full generation returned coarse_prompt: (2, 81)
 Tokens: [921, 928, 264] ... [20, 192, 56]
 Tokens: [772, 1002, 496] ... [836, 633, 994]
 full generation returned fine_prompt: (8, 81)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (789,)
 Tokens: [147, 6242, 302] ... [326, 326, 1376] <256 from end> [211, 211, 211] ... [2069, 9848, 1044]
 next history prompt for coarse coarse_prompt: (2, 1185)
 Tokens: [738, 738, 1017] ... [583, 583, 491] <384 from end> [136, 321, 136] ... [20, 192, 56]
 Tokens: [363, 363, 646] ... [414, 960, 674] <384 from end> [564, 693, 700] ... [836, 633, 994]
 next history prompt for coarse fine_prompt: (8, 1185)
processing semantic_tokens chunk 3 of size: 53
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.83it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (53,)
 Tokens: [50, 27, 27] ... [1044, 118, 27]
 full generation returned coarse_prompt: (2, 79)
 Tokens: [393, 52, 257] ... [393, 91, 738]
 Tokens: [404, 700, 700] ... [947, 665, 859]
 full generation returned fine_prompt: (8, 79)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (842,)
 Tokens: [147, 6242, 302] ... [230, 206, 56] <256 from end> [193, 193, 56] ... [1044, 118, 27]
 next history prompt for coarse coarse_prompt: (2, 1264)
 Tokens: [738, 738, 1017] ... [408, 408, 408] <384 from end> [408, 408, 408] ... [393, 91, 738]
 Tokens: [363, 363, 646] ... [518, 518, 518] <384 from end> [518, 518, 518] ... [947, 665, 859]
 next history prompt for coarse fine_prompt: (8, 1264)
processing semantic_tokens chunk 4 of size: 54
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.79it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (54,)
 Tokens: [27, 27, 1232] ... [206, 2009, 206]
 full generation returned coarse_prompt: (2, 81)
 Tokens: [738, 1017, 106] ... [408, 408, 121]
 Tokens: [859, 928, 969] ... [518, 518, 424]
 full generation returned fine_prompt: (8, 81)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (896,)
 Tokens: [147, 6242, 302] ... [17, 2113, 3745] <256 from end> [5218, 117, 107] ... [206, 2009, 206]
 next history prompt for coarse coarse_prompt: (2, 1345)
 Tokens: [738, 738, 1017] ... [613, 185, 291] <384 from end> [565, 879, 228] ... [408, 408, 121]
 Tokens: [363, 363, 646] ... [453, 198, 298] <384 from end> [809, 516, 687] ... [518, 518, 424]
 next history prompt for coarse fine_prompt: (8, 1345)
processing semantic_tokens chunk 5 of size: 53
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.86it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (53,)
 Tokens: [2009, 206, 528] ... [479, 210, 50]
 full generation returned coarse_prompt: (2, 79)
 Tokens: [408, 408, 408] ... [751, 530, 1010]
 Tokens: [913, 913, 518] ... [924, 924, 924]
 full generation returned fine_prompt: (8, 79)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (949,)
 Tokens: [147, 6242, 302] ... [2497, 8029, 9663] <256 from end> [1908, 50, 5369] ... [479, 210, 50]
 next history prompt for coarse coarse_prompt: (2, 1424)
 Tokens: [738, 738, 1017] ... [983, 216, 747] <384 from end> [958, 921, 604] ... [751, 530, 1010]
 Tokens: [363, 363, 646] ... [229, 654, 996] <384 from end> [307, 307, 888] ... [924, 924, 924]
 next history prompt for coarse fine_prompt: (8, 1424)
processing semantic_tokens chunk 6 of size: 54
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.79it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (54,)
 Tokens: [10, 27, 27] ... [4040, 4667, 50]
 full generation returned coarse_prompt: (2, 81)
 Tokens: [699, 699, 430] ... [475, 738, 62]
 Tokens: [373, 765, 601] ... [519, 544, 913]
 full generation returned fine_prompt: (8, 81)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (1003,)
 Tokens: [147, 6242, 302] ... [5740, 5740, 520] <256 from end> [4638, 298, 4571] ... [4040, 4667, 50]
 next history prompt for coarse coarse_prompt: (2, 1505)
 Tokens: [738, 738, 1017] ... [402, 162, 20] <384 from end> [216, 112, 683] ... [475, 738, 62]
 Tokens: [363, 363, 646] ... [757, 45, 668] <384 from end> [836, 872, 754] ... [519, 544, 913]
 next history prompt for coarse fine_prompt: (8, 1505)
processing semantic_tokens chunk 7 of size: 54
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.80it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (54,)
 Tokens: [10, 27, 9736] ... [206, 193, 193]
 full generation returned coarse_prompt: (2, 81)
 Tokens: [324, 584, 796] ... [408, 62, 62]
 Tokens: [14, 536, 782] ... [913, 424, 424]
 full generation returned fine_prompt: (8, 81)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (1057,)
 Tokens: [147, 6242, 302] ... [5238, 3009, 1787] <256 from end> [50, 10, 27] ... [206, 193, 193]
 next history prompt for coarse coarse_prompt: (2, 1586)
 Tokens: [738, 738, 1017] ... [724, 833, 23] <384 from end> [530, 976, 724] ... [408, 62, 62]
 Tokens: [363, 363, 646] ... [942, 516, 42] <384 from end> [446, 570, 888] ... [913, 424, 424]
 next history prompt for coarse fine_prompt: (8, 1586)
processing semantic_tokens chunk 8 of size: 53
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.87it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (53,)
 Tokens: [193, 147, 193] ... [210, 50, 10]
 full generation returned coarse_prompt: (2, 79)
 Tokens: [408, 62, 62] ... [855, 855, 855]
 Tokens: [913, 424, 424] ... [913, 913, 913]
 full generation returned fine_prompt: (8, 79)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (1110,)
 Tokens: [147, 6242, 302] ... [138, 131, 10] <256 from end> [230, 206, 206] ... [210, 50, 10]
 next history prompt for coarse coarse_prompt: (2, 1665)
 Tokens: [738, 738, 1017] ... [604, 408, 408] <384 from end> [106, 106, 106] ... [855, 855, 855]
 Tokens: [363, 363, 646] ... [928, 765, 928] <384 from end> [913, 913, 913] ... [913, 913, 913]
 next history prompt for coarse fine_prompt: (8, 1665)
processing semantic_tokens chunk 9 of size: 54
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.82it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (54,)
 Tokens: [10, 27, 4035] ... [41, 255, 255]
 full generation returned coarse_prompt: (2, 81)
 Tokens: [472, 472, 404] ... [604, 724, 62]
 Tokens: [928, 729, 729] ... [516, 114, 841]
 full generation returned fine_prompt: (8, 81)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (1164,)
 Tokens: [147, 6242, 302] ... [2305, 147, 5008] <256 from end> [41, 399, 8831] ... [41, 255, 255]
 next history prompt for coarse coarse_prompt: (2, 1746)
 Tokens: [738, 738, 1017] ... [408, 408, 408] <384 from end> [408, 62, 408] ... [604, 724, 62]
 Tokens: [363, 363, 646] ... [518, 518, 518] <384 from end> [544, 424, 518] ... [516, 114, 841]
 next history prompt for coarse fine_prompt: (8, 1746)
processing semantic_tokens chunk 10 of size: 53
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.84it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (53,)
 Tokens: [255, 321, 41] ... [206, 206, 7567]
 full generation returned coarse_prompt: (2, 79)
 Tokens: [724, 871, 939] ... [408, 738, 62]
 Tokens: [687, 1007, 834] ... [544, 544, 424]
 full generation returned fine_prompt: (8, 79)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (1217,)
 Tokens: [147, 6242, 302] ... [3066, 5416, 5416] <256 from end> [3995, 3995, 3995] ... [206, 206, 7567]
 next history prompt for coarse coarse_prompt: (2, 1825)
 Tokens: [738, 738, 1017] ... [890, 612, 1021] <384 from end> [645, 1021, 495] ... [408, 738, 62]
 Tokens: [363, 363, 646] ... [174, 458, 570] <384 from end> [446, 446, 772] ... [544, 544, 424]
 next history prompt for coarse fine_prompt: (8, 1825)
processing semantic_tokens chunk 11 of size: 54
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

100%|██████████| 3/3 [00:01<00:00, 1.76it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (54,)
 Tokens: [65, 206, 206] ... [64, 17, 17]
 full generation returned coarse_prompt: (2, 81)
 Tokens: [408, 408, 408] ... [74, 378, 59]
 Tokens: [518, 518, 544] ... [685, 685, 132]
 full generation returned fine_prompt: (8, 81)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (1271,)
 Tokens: [147, 6242, 302] ... [684, 684, 2775] <256 from end> [171, 130, 6326] ... [64, 17, 17]
 next history prompt for coarse coarse_prompt: (2, 1906)
 Tokens: [738, 738, 1017] ... [942, 402, 428] <384 from end> [428, 402, 833] ... [74, 378, 59]
 Tokens: [363, 363, 646] ... [984, 884, 132] <384 from end> [800, 513, 870] ... [685, 685, 132]
 next history prompt for coarse fine_prompt: (8, 1906)
processing semantic_tokens chunk 12 of size: 54
actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626


100%|██████████| 3/3 [00:01<00:00, 1.82it/s]


actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626

full generation returned
 full generation returned semantic_prompt: (54,)
 Tokens: [9241, 7558, 7558] ... [147, 1613, 2009]
 full generation returned coarse_prompt: (2, 81)
 Tokens: [501, 162, 501] ... [408, 408, 62]
 Tokens: [285, 865, 985] ... [518, 518, 424]
 full generation returned fine_prompt: (8, 81)

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (1325,)
 Tokens: [147, 6242, 302] ... [2305, 2305, 2305] <256 from end> [5008, 147, 27] ... [147, 1613, 2009]
 next history prompt for coarse coarse_prompt: (2, 1987)
 Tokens: [738, 738, 1017] ... [62, 408, 62] <384 from end> [62, 25, 465] ... [408, 408, 62]
 Tokens: [363, 363, 646] ... [424, 913, 424] <384 from end> [424, 646, 775] ... [518, 518, 424]
 next history prompt for coarse fine_prompt: (8, 1987)


In [90]:
# Test: Generate all semantic in one go. Then iteratively chop up the semantic tokens into tiny pieces and feed to the coarse and fine models.
# But this time properly also update the history prompt with the coarse and fine tokens with prev segments

# And further, let's allocate a chunk of permanent base history storage

# Results: Basically perfect, still some minor artifacts between sections, but the voice doens't change

# could look into splitting either semantic or coarse on the non speaking sections which seem easy to recognize. 

SPEAKER = testing_SPEAKER
cell_text_prompt = charlie_text
set_seed(testing_seed)


GEN_TEMP = 0.6


final_pieces = [] 

full_text = cell_text_prompt

print(full_text)


og_history_prompt = load_npz(SPEAKER)

next_history_prompt_for_coarse = None
show_history_prompt_size(og_history_prompt,text="original history_prompt")

semantic_tokens_to_process = generate_text_semantic(
 full_text,
 history_prompt=og_history_prompt,
 temp=GEN_TEMP,
 min_eos_p=0.05,
 silent=True)


split_semantic_tokens = split_array_equally(semantic_tokens_to_process, 12)

previous_segment_buffer = 64

for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):
 print(f"processing semantic_tokens chunk {i + 1} of size: {len(coarse_semantic_tokens)}")

 
 if next_history_prompt_for_coarse is None:
 next_history_prompt_for_coarse = copy.deepcopy(og_history_prompt)
 else:
 next_history_prompt_for_coarse = merge_history_prompts(og_history_prompt, next_history_prompt_for_coarse, right_size=previous_segment_buffer)

 show_history_prompt_size(next_history_prompt_for_coarse,text="next history prompt for coarse", semantic_back_n=previous_segment_buffer)

 #full_generation, audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=next_history_prompt_for_coarse, output_full=True)

 coarse_tokens = generate_coarse(
 coarse_semantic_tokens,
 history_prompt=next_history_prompt_for_coarse,
 temp=0.7,
 silent=True,
 use_kv_caching=True,
 x_coarse_history_alignment_hack=-2
 )
 fine_tokens = generate_fine(
 coarse_tokens,
 history_prompt=next_history_prompt_for_coarse,
 temp=0.5,
 )
 audio_array = codec_decode(fine_tokens)

 full_generation = {
 "semantic_prompt": coarse_semantic_tokens,
 "coarse_prompt": coarse_tokens,
 "fine_prompt": fine_tokens,
 }



 show_history_prompt_size(full_generation, text="full generation returned", semantic_back_n=previous_segment_buffer) 



 next_history_prompt_for_coarse = merge_history_prompts(next_history_prompt_for_coarse, full_generation, right_size=256)

 show_history_prompt_size(next_history_prompt_for_coarse, text="next history prompt for coarse + full generation, end of loop")

 final_pieces.append(audio_array)


Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)

Disabling deterministic algorithms
Set seed to 2973489230
Have I told you that story about how Charlie Parker became Charlie Parker? Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it up.

original history_prompt
 original history_prompt semantic_prompt: (682,)
 Tokens: [147, 6242, 302] ... [10, 230, 56] <256 from end> [206, 10, 206] ... [2403, 147, 2009]
 original history_prompt coarse_prompt: (2, 1025)
 Tokens: [738, 738, 1017] ... [738, 738, 738] <384 from end> [738, 738, 738] ... [717, 121, 121]
 Tokens: [363, 363, 646] ... [937, 544, 937] <384 from end> [544, 544, 544] ... [424, 424, 424]
 original history_prompt fine_prompt: (8, 1025)
processing semantic_tokens chunk 1 of size: 58

next history prompt for coarse
 next history prompt for coarse semantic_prompt: (682,)
 Tokens: [147, 6242, 302] ... [8851, 27, 1041] <64 from end> [59, 28, 107] ... [2403, 147, 2009]
 next history prompt for coarse coarse_prompt: (2, 1025)
 Tokens

In [95]:
# Test: Can we calso split up generate_text_semantic and get coherent and similar results?

# not really. It works but sounds weird. We're gonna have pack the inference space.


SPEAKER = testing_SPEAKER



cell_text_prompt = charlie_text
set_seed(testing_seed)



semantic_tokens_full = []


full_text = cell_text_prompt

print(full_text)
pieces = split_by_words(full_text, 3)
print(pieces)

final_pieces = []

og_speaker_prompt = load_npz(SPEAKER) 
show_history_prompt_size(og_speaker_prompt,text="original history_prompt")


next_segment_history_prompt = None
next_semantic_for_coarse = None

# I think we need more than 256 semantic space here. history won't be good enough. But let's try.
previous_segment_buffer = 64

for i, piece in enumerate(pieces):
 with measure_time(text="Piece", index=i):

 
 if next_segment_history_prompt is None:
 next_segment_history_prompt = copy.deepcopy(og_history_prompt)
 else:
 next_segment_history_prompt = merge_history_prompts(og_history_prompt, next_segment_history_prompt, right_size=previous_segment_buffer)

 show_history_prompt_size(next_segment_history_prompt,text="next_segment_history_prompt", semantic_back_n=previous_segment_buffer)

 semantic_tokens_for_this_piece = generate_text_semantic(
 piece,
 history_prompt=next_segment_history_prompt,
 temp=GEN_TEMP,
 min_eos_p=0.05,
 #top_k = 50,
 #top_p = 0.90,
 silent=True,
 )
 
 semantic_tokens_full.append(semantic_tokens_for_this_piece)

 coarse_tokens_for_this_piece = generate_coarse(
 semantic_tokens_for_this_piece,
 history_prompt=next_segment_history_prompt,
 temp=0.7,
 silent=True,
 use_kv_caching=True,
 x_coarse_history_alignment_hack=-2
 )
 fine_tokens_for_this_piece = generate_fine(
 coarse_tokens_for_this_piece,
 history_prompt=next_segment_history_prompt,
 temp=0.5,
 )
 audio_array = codec_decode(fine_tokens_for_this_piece)

 full_generation = {
 "semantic_prompt": semantic_tokens_for_this_piece,
 "coarse_prompt": coarse_tokens_for_this_piece,
 "fine_prompt": fine_tokens_for_this_piece,
 }

 show_history_prompt_size(full_generation, text="full_generation", semantic_back_n=previous_segment_buffer)


 next_segment_history_prompt = merge_history_prompts(next_segment_history_prompt, full_generation, right_size=1024)

 show_history_prompt_size(next_segment_history_prompt, text="next_segment_history_prompt, full generation, end of loop")

 final_pieces.append(audio_array)



"""


print (f" full len: {len(semantic_tokens_full)}")

#split_semantic_tokens = split_array_equally(semantic_tokens, 8)

#for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):
for i, piece in enumerate(pieces):
 coarse_semantic_tokens = semantic_tokens_full[i]
 print(f"length of coarse_semantic_tokens {i + 1}: {len(coarse_semantic_tokens)}")
 audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=SPEAKER)
 final_pieces += [audio_array, silence.copy()]
"""




Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)

Disabling deterministic algorithms
Set seed to 364314352
Have I told you that story about how Charlie Parker became Charlie Parker? Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it up.
['Have I told', 'you that story', 'about how Charlie', 'Parker became Charlie', "Parker? Parker's a", 'young kid, pretty', 'good on the', 'Sax, gets up', 'to play at', 'a cutting session,', 'and well, he', 'fucks it up.']

original history_prompt
 original history_prompt semantic_prompt: (682,)
 Tokens: [147, 6242, 302] ... [10, 230, 56] <256 from end> [206, 10, 206] ... [2403, 147, 2009]
 original history_prompt coarse_prompt: (2, 1025)
 Tokens: [738, 738, 1017] ... [738, 738, 738] <384 from end> [738, 738, 738] ... [717, 121, 121]
 Tokens: [363, 363, 646] ... [937, 544, 937] <384 from end> [544, 544, 544] ... [424, 424, 424]
 original history_prompt fine_prompt: (8, 1025)

next_segment_history_prompt
 next_segment_history_prompt semantic_prompt: 

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def plot_heatmap(data, width_per_100_cols=1, title=None):
 if len(data.shape) == 1:
 data = data[np.newaxis, :]
 
 width = (data.shape[1] / 100) * width_per_100_cols
 fig, ax = plt.subplots(figsize=(width, 5))
 sns.heatmap(data, cmap='coolwarm', ax=ax)
 
 if title:
 plt.title(title)
 
 plt.show()
 
 
x_coarse_history = og_full_generation["coarse_prompt"]

print(x_coarse_history.shape)
#x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE

print(full_generation["coarse_prompt"].shape)

In [None]:
import rich
from rich import print
from rich import pretty
from rich.pretty import pprint
from rich import inspect

SPEAKER = "en_fiery.npz"
history_prompt_npz = np.load(SPEAKER)

semantic_tokens = history_prompt_npz["semantic_prompt"]
coarse_tokens = history_prompt_npz["coarse_prompt"]
fine_tokens = history_prompt_npz["fine_prompt"]

print(f"semantic_tokens shape: {semantic_tokens.shape}")
print(f"coarse_tokens shape: {coarse_tokens.shape}")
print(f"fine_tokens shape: {fine_tokens.shape}")

plot_heatmap(semantic_tokens)
plot_heatmap(coarse_tokens)
plot_heatmap(fine_tokens)

# print shapes



# The blue values seem to be silence. if we chunk up coarse, we should split on those. 
# is is token 424 and 518 mostly? I culd strip those, then resize semantic. That way if we chunk between words were good.
# or, split on those...
inspect(semantic_tokens, title="semantic_tokens")
inspect(coarse_tokens, title="coarse_tokens")
inspect(fine_tokens, title="fine_tokens")

In [None]:
sns.heatmap(next_coarse_tokens, cmap='coolwarm')
plt.title('Coarse Prompt')
plt.show()

In [None]:
og_coarse = og_full_generation["coarse_prompt"]
quick_cat_test = np.hstack([og_coarse,og_coarse]).astype(np.int32)

width_per_100_cols = 1

width1 = (og_coarse.shape[1] / 100) * width_per_100_cols
fig, ax = plt.subplots(figsize=(width1, 5))
sns.heatmap(og_coarse, cmap='coolwarm', ax=ax)
plt.title('Coarse Prompt orig')
plt.show()

width2 = (quick_cat_test.shape[1] / 100) * width_per_100_cols
fig, ax = plt.subplots(figsize=(width2, 5))

sns.heatmap(quick_cat_test, cmap='coolwarm', ax=ax)
plt.title('Coarse Prompt concat with itself')
plt.show()

In [None]:
text = """
Truth is, I don't think people understood what it was I was doing at Schaffer. 
I wasn't there to conduct. 
How many fucken morons can wave his arms and keep people in tempo? 
I was there to push people beyond what's expected of them. 
I believe that is an absolute necessity. 
Otherwise we're depriving the world of the next Louis Armstrong, 
or the next Charlie Parker. 
Have I told you that story about how Charlie Parker became Charlie Parker?
Parker's a young kid, pretty good on the Sax, 
gets up to play at a cutting session, 
and well, he fucks it up. 
And Jones nearly decapitates him for it, throws a cymbal at his head. 
"""

In [None]:
# Test: When generate_text_semantic is given way too many tokens, and the audio is messed up, are we sure it's not the coarse function that is dying when trying to handle the big prompt?
# To figure out, let's give it some bgi text and split the semantic tokens ourselves
# voice speed is largely determined by how many words you generate_text_semantic to represent, though the history_prompt has a huge effect
# because that's how many words THAT was asked to represent

# result, not it's the semantic

# you can see the tokens stop increasing at some point. could be good rule of thumb for chunk decisions

GEN_TEMP = 0.6
SPEAKER = "v2/en_speaker_6"
silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence

# A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools.
import time

import pandas

set_seed(-1)

this_segment_start_time = time.time()


print(f"Segment Start at: {time.strftime('%Y-%m-%d %H:%M:%S')}")
final_pieces = [] 

semantic_tokens_full = []


pieces = text.strip().split("\n")

last_piece = ''
for i, piece in enumerate(pieces):
 piece = f"{last_piece} {piece}"
 print(f"piece {i}: {piece}")

 semantic_tokens = generate_text_semantic(
 piece,
 history_prompt=SPEAKER,
 temp=GEN_TEMP,
 min_eos_p=0.05, 
 silent=True,
 )
 last_piece = piece
 semantic_token_length = len(semantic_tokens)
 print(f"length of semantic_tokens: {semantic_token_length}")
 

In [None]:
# Test: When generate_text_semantic is given way too many tokens, and the audio is messed up, are we sure it's not the coarse function that is dying when trying to handle the big prompt?
# To figure out, let's give it some bgi text and split the semantic tokens ourselves

GEN_TEMP = 0.6
SPEAKER = "v2/en_speaker_6"
silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence

# A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools.
import time

import pandas

set_seed(-1)

this_segment_start_time = time.time()


print(f"Segment Start at: {time.strftime('%Y-%m-%d %H:%M:%S')}")
final_pieces = [] 

semantic_tokens_full = []




full_text = text
pieces = text.split("\n")
print(pieces)

last_piece = ''
for i, piece in enumerate(pieces):
 piece += last_piece
 print(f"piece {i}: {piece}")

 semantic_tokens = generate_text_semantic(
 piece,
 history_prompt=SPEAKER,
 temp=GEN_TEMP,
 min_eos_p=0.05, 
 )
 last_piece = piece
 semantic_token_length = len(semantic_tokens)
 print(f"length of semantic_tokens: {semantic_token_length}")
 


semantic_end_time = time.time()
elapsed_time = semantic_end_time - this_segment_start_time

semantic_token_length = len(semantic_tokens)
time_finished = f"{semantic_token_length} semantic_tokens Finished at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(semantic_end_time))}"
time_taken = f"in {elapsed_time} seconds"
print(f" -->{time_finished} {time_taken}")


"""

history_prompt_npz = np.load("bark/assets/prompts/v2/en_speaker_6.npz")

semantic_tokens = history_prompt_npz["semantic_prompt"]
coarse_tokens = history_prompt_npz["coarse_prompt"]
fine_tokens = history_prompt_npz["fine_prompt"]

next_segment_history_prompt = {
 "semantic_prompt": semantic_tokens,
 "coarse_prompt": coarse_tokens,
 "fine_prompt": fine_tokens,
}


for i, piece in enumerate(pieces):

 print(f"i: {i} piece: {piece}")

 semantic_tokens = generate_text_semantic(
 piece,
 history_prompt=next_segment_history_prompt,
 temp=GEN_TEMP,
 min_eos_p=0.05, # this controls how likely the generation is to end
 )

 # need to save for coarse? first try without. just using base histotry for coarse, but with semantic toekes preogressive

 new_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)


 next_segment_history_prompt["semantic_prompt"] = semantic_tokens
 
 semantic_tokens_full.append(semantic_tokens)
"""
#Assuming x_semantic_history and x_semantic are already defined




print (f" full len: {len(semantic_tokens)}")

split_semantic_tokens = split_array_equally(semantic_tokens, 8)

for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):
 print(f"length of coarse_semantic_tokens {i + 1}: {len(coarse_semantic_tokens)}")
 audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=SPEAKER)
 final_pieces += [audio_array, silence.copy()]



coarse_end_time = time.time()
elapsed_time = coarse_end_time - semantic_end_time


time_finished = f"coarse finished Finished at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(coarse_end_time))}"
time_taken = f"in {elapsed_time} seconds"
print(f" -->{time_finished} {time_taken}")

Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)

# $ \\ $

# Make a Long-Form Dialog with Bark

### Step 1: Format a script and speaker lookup

In [None]:
speaker_lookup = {"Samantha": "v2/en_speaker_9", "John": "v2/en_speaker_2"}

# Script generated by chat GPT
script = """
Samantha: Hey, have you heard about this new text-to-audio model called "Bark"?

John: No, I haven't. What's so special about it?

Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.

John: Wow, that sounds amazing. How does it work?

Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.

John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?

Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.

John: I can imagine. It would be like having your own personal voiceover artist.

Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audio technology."""
script = script.strip().split("\n")
script = [s.strip() for s in script if s]
script

### Step 2: Generate the audio for every speaker turn

In [None]:
pieces = []
silence = np.zeros(int(0.5*SAMPLE_RATE))
for line in script:
 speaker, text = line.split(": ")
 audio_array = generate_audio(text, history_prompt=speaker_lookup[speaker], )
 pieces += [audio_array, silence.copy()]

### Step 3: Concatenate all of the audio and play it

In [None]:
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)