import torch

from IPython.display import Audio

from audiodiffusion import AudioDiffusion, AudioDiffusionPipeline
from audiodiffusion.audio_encoder import AudioEncoder

import librosa
import librosa.display
import numpy as np
import random

device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint names kept for reference; the individual pipelines are loaded below.
model_name = ["SAint7579/orpheus_ldm_model_v1-0", "teticio/audio-diffusion-ddim-256"]

# Three generator checkpoints plus a DDIM pipeline used for encoding and
# spherical interpolation (slerp).
audio_diffusion_v0 = AudioDiffusionPipeline.from_pretrained("teticio/latent-audio-diffusion-256").to(device)
audio_diffusion_v1 = AudioDiffusionPipeline.from_pretrained("SAint7579/orpheus_ldm_model_v1-0").to(device)
audio_diffusion_v2 = AudioDiffusionPipeline.from_pretrained("SAint7579/orpheus_ldm_model_v2-0").to(device)
ddim = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256").to(device)
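
# Sanity-check sketch (assumption, not in the original script): the helpers below
# feed raw audio straight to the pipelines, so conditioning tracks should be
# loaded at the mel converter's sample rate.
print(f"Mel sample rate: {ddim.mel.get_sample_rate()} Hz")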


def generate_from_music(song_array, diffuser, start_step, total_steps=100, device=device):
    """
    Generate audio from a given song array using a given diffuser.

    Parameters
    ----------
    song_array : numpy.ndarray
        The raw audio to condition the generation on.
    diffuser : AudioDiffusionPipeline
        The diffuser to use for generation.
    start_step : int
        The denoising step to start generating from.
    total_steps : int
        The total number of denoising steps.
    device : str
        The device to use for generation.

    Returns
    -------
    PIL.Image.Image
        Mel spectrogram of the generated audio.
    numpy.ndarray
        The generated audio.
    """
    generator = torch.Generator(device=device)
    generator.seed()
    output = diffuser(raw_audio=song_array, generator=generator, start_step=start_step, steps=total_steps)
    return output.images[0], output.audios[0, 0]
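
# Usage sketch (assumption, not from the original script): condition a short
# generation on a track loaded at the mel sample rate. "conditioning_song.wav"
# is a placeholder path; the output stays closer to the input as start_step
# approaches total_steps. In a notebook, Audio(...) renders a player.
song_array, _ = librosa.load("conditioning_song.wav", sr=ddim.mel.get_sample_rate(), mono=True)
spec, audio = generate_from_music(song_array, audio_diffusion_v1, start_step=500, total_steps=1000)
Audio(audio, rate=audio_diffusion_v1.mel.get_sample_rate())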


def generate_from_music_long(song_array, diffuser, start_step, total_steps=100, device=device):
    """
    Generate a roughly 10-second audio clip from a given song array by extending
    a first generation with an overlapping continuation.

    Parameters
    ----------
    song_array : numpy.ndarray
        The raw audio to condition the generation on.
    diffuser : AudioDiffusionPipeline
        The diffuser to use for generation.
    start_step : int
        The denoising step to start generating from.
    total_steps : int
        The total number of denoising steps.
    device : str
        The device to use for generation.

    Returns
    -------
    PIL.Image.Image
        Mel spectrogram of the first generated slice.
    numpy.ndarray
        The generated audio.
    """
    generator = torch.Generator(device=device)
    generator.seed()
    output = diffuser(raw_audio=song_array, generator=generator, start_step=start_step, steps=total_steps)

    track = output.audios[0, 0]
    sample_rate = diffuser.mel.get_sample_rate()
    overlap_secs = 2
    overlap_samples = overlap_secs * sample_rate

    # Continue from the last two seconds of the first slice, masking the overlap
    # so the new slice stays consistent with the end of the previous one.
    continue_output = diffuser(raw_audio=track[-overlap_samples:],
                               generator=generator,
                               start_step=start_step,
                               steps=total_steps,
                               mask_start_secs=overlap_secs)

    audio2 = continue_output.audios[0, 0]
    track = np.concatenate([track, audio2[overlap_samples:]])

    return output.images[0], track


def iterative_slerp(song_arrays, ddim, steps=10):
    """Encode each song to DDIM noise and iteratively slerp the noise tensors.

    Parameters
    ----------
    song_arrays : list
        List of song arrays to slerp.
    ddim : AudioDiffusionPipeline
        DDIM pipeline used for encoding and spherical interpolation.
    steps : int
        Number of DDIM encoding steps.

    Returns
    -------
    slerp : torch.Tensor
        Slerped noise tensor.
    """
    noise = []
    for arr in song_arrays:
        # Convert the raw audio into a mel spectrogram image and encode it to noise.
        ddim.mel.load_audio(raw_audio=arr)
        noise.append(ddim.encode([ddim.mel.audio_slice_to_image(0)], steps=steps))

    # Fold the noise tensors together pairwise with spherical interpolation.
    slerp = noise[0]
    for i in range(1, len(noise)):
        slerp = ddim.slerp(slerp, noise[i], 0.5)

    return slerp


def merge_songs(song_arrays, ddim, slerp_steps=10, diffusion_steps=100, device=device):
    """Merge songs by slerping their DDIM noise encodings and decoding the result.

    Parameters
    ----------
    song_arrays : list
        List of song arrays to merge.
    ddim : AudioDiffusionPipeline
        DDIM pipeline used for encoding, interpolation and decoding.
    slerp_steps : int
        Number of DDIM steps used to encode each song to noise.
    diffusion_steps : int
        Number of denoising steps used to decode the merged noise.
    device : str
        The device to use for generation.

    Returns
    -------
    spectrogram : PIL.Image.Image
        Mel spectrogram of the merged audio.
    audio : np.ndarray
        Merged audio.
    """
    generator = torch.Generator(device=device)
    generator.manual_seed(7579)
    slerp = iterative_slerp(song_arrays, ddim, slerp_steps)
    merged = ddim(noise=slerp, generator=generator, steps=diffusion_steps)
    return merged.images[0], merged.audios[0, 0]
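
# Usage sketch (assumption, not from the original script): merge two tracks.
# "song_a.wav" and "song_b.wav" are placeholder paths, loaded at the DDIM mel
# sample rate before being encoded, slerped and decoded.
sr = ddim.mel.get_sample_rate()
song_a, _ = librosa.load("song_a.wav", sr=sr, mono=True)
song_b, _ = librosa.load("song_b.wav", sr=sr, mono=True)
merged_spec, merged_audio = merge_songs([song_a, song_b], ddim, slerp_steps=50, diffusion_steps=50)
Audio(merged_audio, rate=sr)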


def generate_songs(conditioning_songs, similarity=0.9, quality=500, merging_quality=100, device=device):
    """Generate a song conditioned on one or more existing songs.

    Parameters
    ----------
    conditioning_songs : list
        List of conditioning song arrays.
    similarity : float
        Similarity between the conditioning songs and the generated song
        (must be strictly less than 1).
    quality : int
        Quality of the generated song (scales the number of denoising steps).
    merging_quality : int
        Number of steps used when merging multiple conditioning songs.
    device : str
        The device to use for generation.

    Returns
    -------
    spec_generated : PIL.Image.Image
        Spectrogram of the generated song.
    generated : np.ndarray
        Generated song.
    model_name : str
        Name of the randomly chosen generator checkpoint ("v0", "v1" or "v2").
    """
    print("Merging songs...")
    if len(conditioning_songs) > 1:
        spec_merged, merged = merge_songs(conditioning_songs, ddim,
                                          slerp_steps=merging_quality,
                                          diffusion_steps=merging_quality,
                                          device=device)
    else:
        merged = conditioning_songs[0]

    # Pick one of the generator checkpoints at random.
    diffuser, model_name = random.choice([
        (audio_diffusion_v0, "v0"),
        (audio_diffusion_v1, "v1"),
        (audio_diffusion_v2, "v2")
    ])

    print("Generating song...")

    # Higher similarity starts the reverse diffusion later in the schedule, so less
    # noise is added to the conditioning audio and the output stays closer to it.
    total_steps = min(1000, int(quality / (1 - similarity)))
    start_step = int(total_steps * similarity)
    spec_generated, generated = generate_from_music(merged, diffuser, start_step=start_step,
                                                    total_steps=total_steps, device=device)

    return spec_generated, generated, model_name
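
# End-to-end usage sketch (assumption, not from the original script): condition a
# new generation on the two placeholder tracks loaded above and play the result
# in a notebook.
spec, generated, chosen_model = generate_songs([song_a, song_b], similarity=0.9, quality=500)
print(f"Generated with checkpoint {chosen_model}")
Audio(generated, rate=ddim.mel.get_sample_rate())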