Spaces:
Runtime error
Runtime error
!nvidia-smi | |
import jukebox | |
import torch as t | |
import librosa | |
import os | |
from IPython.display import Audio | |
from jukebox.make_models import make_vqvae, make_prior, MODELS, make_model | |
from jukebox.hparams import Hyperparams, setup_hparams | |
from jukebox.sample import sample_single_window, _sample, \ | |
sample_partial_window, upsample | |
from jukebox.utils.dist_utils import setup_dist_from_mpi | |
from jukebox.utils.torch_utils import empty_cache | |
# --- Runtime and model setup -------------------------------------------------
# The distributed bootstrap hands back this process's rank, local rank and the
# device that the models below are created on.
rank, local_rank, device = setup_dist_from_mpi()

model = "5b_lyrics"  # or "1b_lyrics"
_is_5b = model == "5b_lyrics"

# Batch/chunk sizes used for top-level sampling: the 5b model gets the smaller
# values, every other model the larger ones.
max_batch_size = 3 if _is_5b else 16
chunk_size = 16 if _is_5b else 32

# Global sampling hyperparameters.
hps = Hyperparams()
hps.name = 'samples'
hps.sr = 44100
hps.levels = 3
hps.hop_fraction = [.5, .5, .125]
hps.n_samples = 3 if _is_5b else 8

# MODELS[model] lists the VQ-VAE hparams entry first, followed by one entry per
# prior; the last prior entry is the one used to build top_prior.
vqvae, *priors = MODELS[model]

vqvae_hps = setup_hparams(vqvae, dict(sample_length=1048576))
vqvae = make_vqvae(vqvae_hps, device)

top_prior_hps = setup_hparams(priors[-1], dict())
top_prior = make_prior(top_prior_hps, vqvae, device)
# Full length of musical sample to generate, in seconds - we find songs in the
# 1 to 4 minute range work well, with generation time proportional to sample
# length. This total length affects how quickly the model progresses through
# lyrics (model also generates differently depending on if it thinks it's in
# the beginning, middle, or end of sample).
sample_length_in_seconds = 60

# Convert seconds to raw audio samples and round down to a whole multiple of
# top_prior.raw_to_tokens.
hps.sample_length = (
    int(sample_length_in_seconds * hps.sr) // top_prior.raw_to_tokens
) * top_prior.raw_to_tokens

# The sample must span at least top_prior.n_ctx * top_prior.raw_to_tokens raw
# samples. Raise explicitly (instead of `assert`, which is stripped under -O)
# and name the quantity that actually has to grow: the sample length.
min_sample_length = top_prior.n_ctx * top_prior.raw_to_tokens
if hps.sample_length < min_sample_length:
    raise ValueError(
        f'Please choose a larger sample length: '
        f'sample_length_in_seconds={sample_length_in_seconds} gives '
        f'{hps.sample_length} samples, but at least {min_sample_length} '
        f'(about {min_sample_length / hps.sr:.1f}s at {hps.sr} Hz) are required'
    )
# Lyrics conditioning text, shared by every sample in the batch.
lyrics = """I met a traveller from an antique land,
Who said—“Two vast and trunkless legs of stone
Stand in the desert. . . . Near them, on the sand,
Half sunk a shattered visage lies, whose frown,
And wrinkled lip, and sneer of cold command,
Tell that its sculptor well those passions read
Which yet survive, stamped on these lifeless things,
The hand that mocked them, and the heart that fed;
And on the pedestal, these words appear:
My name is Ozymandias, King of Kings;
Look on my Works, ye Mighty, and despair!
Nothing beside remains. Round the decay
Of that colossal Wreck, boundless and bare
The lone and level sands stretch far away
"""

# One metadata dict per sample. Each sample gets its own dict (rather than
# `[d] * n`, which repeats a single shared object) so that editing one entry
# later cannot silently change the others.
metas = [
    dict(
        artist="Zac Brown Band",
        genre="Country",
        total_length=hps.sample_length,
        offset=0,
        lyrics=lyrics,
    )
    for _ in range(hps.n_samples)
]

# Three label slots; only the last one is filled here, from the top prior's
# labeller. The first two stay None until the upsamplers are created.
labels = [None, None, top_prior.labeller.get_batch_labels(metas, 'cuda')]
# --- Sampling settings -------------------------------------------------------
# These must be defined before the first sampling call below; previously they
# were only assigned after `upsample(...)` had already used them.
sampling_temperature = .98

lower_batch_size = 16
max_batch_size = 3 if model == "5b_lyrics" else 16
lower_level_chunk_size = 32
chunk_size = 16 if model == "5b_lyrics" else 32
# One kwargs dict per entry of the priors list passed to the sampling calls
# below ([upsampler, upsampler, top_prior]) - presumably matched by index.
sampling_kwargs = [
    dict(temp=.99, fp16=True, max_batch_size=lower_batch_size,
         chunk_size=lower_level_chunk_size),
    dict(temp=0.99, fp16=True, max_batch_size=lower_batch_size,
         chunk_size=lower_level_chunk_size),
    dict(temp=sampling_temperature, fp16=True,
         max_batch_size=max_batch_size, chunk_size=chunk_size),
]

# --- Top-level sampling ------------------------------------------------------
# `zs` has to exist before upsampling. Start every level from an empty
# (zero-length) token tensor and sample only level 2 with top_prior, while
# top_prior is still loaded.
zs = [t.zeros(hps.n_samples, 0, dtype=t.long, device='cuda') for _ in range(len(priors))]
zs = _sample(zs, labels, sampling_kwargs, [None, None, top_prior], [2], hps)

# Listen to the top-level result.
# NOTE(review): assumes the sampler writes wav files under
# '{hps.name}/level_<n>/' - confirm. A bare Audio(...) expression is only
# rendered when it is the last expression of a notebook cell.
Audio(f'{hps.name}/level_2/item_0.wav')

# --- Upsampling --------------------------------------------------------------
# Set this False if you are on a local machine that has enough memory (this allows you to do the
# lyrics alignment visualization during the upsampling stage). For a hosted runtime,
# we'll need to go ahead and delete the top_prior if you are using the 5b_lyrics model.
if True:
    del top_prior
    empty_cache()
    top_prior = None

# Build the remaining priors on the CPU and fill in their label slots.
upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, 'cpu') for prior in priors[:-1]]
labels[:2] = [prior.labeller.get_batch_labels(metas, 'cuda') for prior in upsamplers]

zs = upsample(zs, labels, sampling_kwargs, [*upsamplers, top_prior], hps)

# Release the upsamplers before loading the final audio for playback.
del upsamplers
empty_cache()
Audio(f'{hps.name}/level_0/item_0.wav')