import torch
from IPython.display import Audio
from audiodiffusion import AudioDiffusion, AudioDiffusionPipeline
from audiodiffusion.audio_encoder import AudioEncoder
import librosa
import librosa.display
import numpy as np
import random
device = "cuda" if torch.cuda.is_available() else "cpu"
# Pre-trained pipelines: two latent-diffusion generators ("v0" and "v1") and a
# DDIM model used for encoding and interpolating conditioning tracks.
audio_diffusion_v0 = AudioDiffusionPipeline.from_pretrained("teticio/latent-audio-diffusion-256").to(device)
audio_diffusion_v1 = AudioDiffusionPipeline.from_pretrained("SAint7579/orpheus_ldm_model_v1-0").to(device)
ddim = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256").to(device)
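# A minimal loading helper (sketch): the pipelines work on raw audio at the mel
# converter's sample rate, so resample on load. `load_track`, `path`, and
# `seconds` are illustrative names, not part of the original module.
def load_track(path, diffuser=ddim, seconds=None):
    """Load an audio file resampled to the diffuser's mel sample rate."""
    sr = diffuser.mel.get_sample_rate()
    audio, _ = librosa.load(path, sr=sr)
    if seconds is not None:
        # Keep only the first `seconds` of audio.
        audio = audio[:int(seconds * sr)]
    return audio, sr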
# Generate a new track conditioned on raw audio.
def generate_from_music(song_array, diffuser, start_step, total_steps=100, device=device):
    """
    Generate audio from a given song array using a given diffuser.

    Parameters
    ----------
    song_array : numpy.ndarray
        The raw audio to condition the generation on.
    diffuser : AudioDiffusionPipeline
        The diffusion pipeline used to generate the audio.
    start_step : int
        The denoising step to start generating from; higher values preserve
        more of the original audio.
    total_steps : int
        The total number of denoising steps.
    device : str
        The device to run generation on.

    Returns
    -------
    PIL.Image.Image
        The generated mel spectrogram image.
    numpy.ndarray
        The generated audio.
    """
    generator = torch.Generator(device=device)
    generator.seed()
    output = diffuser(raw_audio=song_array, generator=generator, start_step=start_step, steps=total_steps)
    return output.images[0], output.audios[0, 0]
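# Example usage (sketch; the path is hypothetical): condition on the first
# 5 seconds of a track loaded at 22.05 kHz, keeping 50% of the input.
# track, sr = librosa.load("my_song.mp3", sr=22050)
# spec, audio = generate_from_music(track[:sr * 5], audio_diffusion_v1,
#                                   start_step=500, total_steps=1000)
# Audio(audio, rate=sr)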
def generate_from_music_long(song_array, diffuser, start_step, total_steps=100, device=device):
    """
    Generate a roughly 10 second audio clip from a given song array by
    generating one window and then continuing it with a fixed overlap.

    Parameters
    ----------
    song_array : numpy.ndarray
        The raw audio to condition the generation on.
    diffuser : AudioDiffusionPipeline
        The diffusion pipeline used to generate the audio.
    start_step : int
        The denoising step to start generating from.
    total_steps : int
        The total number of denoising steps.
    device : str
        The device to run generation on.

    Returns
    -------
    PIL.Image.Image
        The mel spectrogram of the first window.
    numpy.ndarray
        The generated audio: two windows joined at the overlap.
    """
    generator = torch.Generator(device=device)
    generator.seed()
    output = diffuser(raw_audio=song_array, generator=generator, start_step=start_step, steps=total_steps)
    # Condition a second pass on the tail of the first track; mask_start_secs
    # keeps the overlapping seconds fixed so the continuation stitches cleanly.
    track = output.audios[0, 0]
    sample_rate = diffuser.mel.get_sample_rate()
    overlap_secs = 2
    overlap_samples = overlap_secs * sample_rate
    continue_output = diffuser(raw_audio=track[-overlap_samples:],
                               generator=generator,
                               start_step=start_step,
                               mask_start_secs=overlap_secs)
    audio2 = continue_output.audios[0, 0]
    # Append only the newly generated part after the overlap.
    track = np.concatenate([track, audio2[overlap_samples:]])
    return output.images[0], track
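# An alternative stitch (sketch): linearly crossfade the overlap instead of
# hard-concatenating after it. This is an assumption about what sounds
# smoother, not part of the original pipeline; `crossfade_concat` is a
# hypothetical helper.
def crossfade_concat(a, b, overlap_samples):
    """Concatenate a and b, linearly crossfading their overlap_samples-long overlap."""
    fade = np.linspace(0.0, 1.0, overlap_samples)
    # Fade a out while fading b in over the shared region.
    blended = a[-overlap_samples:] * (1.0 - fade) + b[:overlap_samples] * fade
    return np.concatenate([a[:-overlap_samples], blended, b[overlap_samples:]])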
# Interpolate several tracks in the DDIM noise space.
def iterative_slerp(song_arrays, ddim, steps=10):
    """Spherically interpolate the DDIM-encoded noise of several tracks.

    Parameters
    ----------
    song_arrays : list
        List of song arrays to interpolate.
    ddim : AudioDiffusionPipeline
        DDIM pipeline used to encode the tracks into noise.
    steps : int
        Number of steps used for DDIM encoding.

    Returns
    -------
    slerp : torch.Tensor
        The interpolated noise tensor.
    """
    # Encode each track's first mel-spectrogram window into DDIM noise.
    noise = []
    for arr in song_arrays:
        ddim.mel.audio = arr
        noise.append(ddim.encode([ddim.mel.audio_slice_to_image(0)], steps=steps))
    # Fold the noises together pairwise with spherical interpolation.
    slerp = noise[0]
    for i in range(1, len(noise)):
        slerp = ddim.slerp(slerp, noise[i], 0.5)
    return slerp
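# Note: folding with a fixed t=0.5 weights later tracks more heavily (the last
# track ends up with weight 1/2, the first with 1/2**(n-1)). A sketch of an
# (approximately, since slerp is spherical rather than linear) equal-weight
# variant, assuming ddim.slerp(a, b, t) returns a at t=0 and b at t=1:
def iterative_slerp_equal(song_arrays, ddim, steps=10):
    """Like iterative_slerp, but each track contributes roughly equal weight."""
    noise = []
    for arr in song_arrays:
        ddim.mel.audio = arr
        noise.append(ddim.encode([ddim.mel.audio_slice_to_image(0)], steps=steps))
    slerp = noise[0]
    for i in range(1, len(noise)):
        # Interpolation factor 1/(i+1) gives a running equal-weight average.
        slerp = ddim.slerp(slerp, noise[i], 1 / (i + 1))
    return slerp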
def merge_songs(song_arrays, ddim, slerp_steps=10, diffusion_steps=100, device=device):
    """Merge several songs into one by interpolating their encoded noise.

    Parameters
    ----------
    song_arrays : list
        List of song arrays to merge.
    ddim : AudioDiffusionPipeline
        DDIM pipeline used for encoding and decoding.
    slerp_steps : int
        Number of steps used for DDIM encoding during interpolation.
    diffusion_steps : int
        Number of denoising steps used to decode the merged noise.
    device : str
        The device to run generation on.

    Returns
    -------
    spectrogram : np.ndarray
        Mel spectrogram of the merged track.
    audio : np.ndarray
        The merged audio.
    """
    # Fixed seed so merging the same inputs is reproducible.
    generator = torch.Generator(device=device)
    generator.manual_seed(7579)
    slerp = iterative_slerp(song_arrays, ddim, slerp_steps)
    merged = ddim(noise=slerp, generator=generator, steps=diffusion_steps)
    return merged.images[0], merged.audios[0, 0]
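# Example usage (sketch, hypothetical paths): merge two 10 second clips.
# a, sr = librosa.load("song_a.mp3", sr=22050)
# b, _ = librosa.load("song_b.mp3", sr=22050)
# spec, merged_audio = merge_songs([a[:sr * 10], b[:sr * 10]], ddim)
# Audio(merged_audio, rate=sr)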
# Top-level entry point: merge the conditioning songs, then generate a new track.
def generate_songs(conditioning_songs, similarity=0.9, quality=500, merging_quality=100, device=device):
    """Generate a song conditioned on one or more input songs.

    Parameters
    ----------
    conditioning_songs : list
        List of conditioning song arrays.
    similarity : float
        How closely the output should follow the conditioning songs,
        in [0, 1).
    quality : int
        Quality of the generated song; scales the number of denoising steps.
    merging_quality : int
        Number of steps used when merging multiple conditioning songs.
    device : str
        The device to run generation on.

    Returns
    -------
    spec_generated : np.ndarray
        Spectrogram of the generated song.
    generated : np.ndarray
        The generated song.
    model_name : str
        Which generator was used ("v0" or "v1").
    """
    # Merge the conditioning songs into a single track if more than one is given.
    print("Merging songs...")
    if len(conditioning_songs) > 1:
        spec_merged, merged = merge_songs(conditioning_songs, ddim, slerp_steps=merging_quality, diffusion_steps=merging_quality, device=device)
    else:
        merged = conditioning_songs[0]
    # Optionally take a random 5 second slice from the merged song (disabled):
    # sample_rate = ddim.mel.get_sample_rate()
    # start = np.random.randint(0, len(merged) - 5 * sample_rate)
    # merged = merged[start:start + 5 * sample_rate]
    # Pick one of the two generators at random.
    if random.random() < 0.5:
        diffuser = audio_diffusion_v0
        model_name = "v0"
    else:
        diffuser = audio_diffusion_v1
        model_name = "v1"
    print("Generating song...")
    # quality ~= total_steps * (1 - similarity), so total_steps = quality / (1 - similarity),
    # capped at 1000. start_step grows with similarity, preserving more of the input.
    total_steps = min([1000, int(quality / (1 - similarity))])
    start_step = int(total_steps * similarity)
    spec_generated, generated = generate_from_music(merged, diffuser, start_step=start_step, total_steps=total_steps, device=device)
    return spec_generated, generated, model_name
# if __name__ == '__main__':
#     song1 = "D:/Projects/Orpheus_ai/DataSet/Sep_Dataset/accompaniment/000010.mp3"
#     song2 = "D:/Projects/Orpheus_ai/DataSet/Sep_Dataset/accompaniment/000002.mp3"
#     song3 = "D:/Projects/Orpheus_ai/DataSet/Sep_Dataset/accompaniment/000003.mp3"
#     song_array_1, sr = librosa.load(song1, sr=22050)
#     song_array_1 = song_array_1[:sr * 5]
#     song_array_2, sr = librosa.load(song2, sr=22050)
#     song_array_2 = song_array_2[:sr * 10]
#     song_array_3, sr = librosa.load(song3, sr=22050)
#     song_array_3 = song_array_3[:sr * 10]
#     # seed = 239150437427
#     image, audio, model_name = generate_songs([song_array_2, song_array_3], similarity=0.5, quality=10, device=device)
#     Audio(audio, rate=sr)