Spaces:
Running
on
Zero
Running
on
Zero
File size: 15,924 Bytes
b6ff5af 8a7e9fd ff54f69 3c1e68c ed7d0fe b6ff5af 8bdf8d9 b6ff5af 8bdf8d9 90d8e0f 140426b 8bdf8d9 72a3d04 140426b bd9caa9 717ff8a 8bdf8d9 717ff8a 8bdf8d9 717ff8a 8bdf8d9 717ff8a 8bdf8d9 72a3d04 40a916f 58fc3d4 ed7d0fe 3c1e68c 58fc3d4 ed7d0fe 3c1e68c ed7d0fe 3c1e68c 58fc3d4 ed7d0fe 58fc3d4 3c1e68c ed7d0fe 58fc3d4 3c1e68c 58fc3d4 ed7d0fe 58fc3d4 ed7d0fe 58fc3d4 ed7d0fe 58fc3d4 3c1e68c 58fc3d4 3c1e68c b6ff5af 46b3885 b6ff5af ed7d0fe b6ff5af ed7d0fe 46b3885 b6ff5af 140426b 5098605 140426b 8bdf8d9 5098605 8bdf8d9 140426b b6ff5af 140426b ed7d0fe 8bdf8d9 3c1e68c 58fc3d4 3c1e68c b6ff5af 5098605 58fc3d4 b6ff5af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 |
import gradio as gr
from musiclang_predict import MusicLangPredictor
import random
import subprocess
import os
import torchaudio
import torch
import numpy as np
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
from pydub import AudioSegment
import spaces
import tempfile
from pydub import AudioSegment
import io
# Pick the compute device once at import time: the GPU when PyTorch reports
# CUDA support, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Utility Functions
def peak_normalize(y, target_peak=0.97):
    """Scale ``y`` so its largest absolute sample equals ``target_peak``.

    Args:
        y: NumPy array of audio samples.
        target_peak: Absolute amplitude the loudest sample should have
            after scaling.

    Returns:
        The rescaled array. Empty or all-zero input has no peak to scale
        from, so an unmodified copy is returned instead of dividing by zero
        (which would fill the result with NaN).
    """
    peak = np.max(np.abs(y)) if np.size(y) else 0.0
    if peak == 0:
        # Silence (or no samples at all): nothing meaningful to normalise.
        return np.array(y, copy=True)
    return target_peak * (y / peak)
def rms_normalize(y, target_rms=0.05):
    """Scale ``y`` so its root-mean-square level equals ``target_rms``.

    Args:
        y: NumPy array of audio samples.
        target_rms: RMS level the result should have.

    Returns:
        The rescaled array. Empty or all-zero input has an RMS of zero, so an
        unmodified copy is returned instead of dividing by zero (which would
        fill the result with NaN).
    """
    rms = np.sqrt(np.mean(y ** 2)) if np.size(y) else 0.0
    if rms == 0:
        # Silence (or no samples at all): nothing meaningful to normalise.
        return np.array(y, copy=True)
    return y * (target_rms / rms)
def preprocess_audio(waveform):
    """Rebuild ``waveform`` on ``device`` with a single leading axis added.

    The tensor is copied to the CPU, all size-1 dimensions are dropped, the
    data is round-tripped through NumPy, a new dimension is inserted at
    position 0 and the result is moved to the module-level ``device``.
    No peak or RMS normalisation is applied here.
    """
    # NumPy conversion requires host memory, so go to the CPU first.
    host_array = waveform.cpu().squeeze().numpy()
    rebuilt = torch.from_numpy(host_array)
    rebuilt = rebuilt.unsqueeze(0)
    return rebuilt.to(device)
def create_slices(song, sr, slice_duration, bpm, num_slices=5):
    """Cut equally long excerpts out of ``song``.

    The first excerpt always starts at sample 0. Every further excerpt starts
    at a randomly chosen multiple of one 4-beat bar (at ``bpm``) counted from
    the end of the first excerpt. An excerpt that would run past the end of
    the song wraps around to the beginning.

    Args:
        song: Audio tensor whose last dimension is time in samples; any
            leading dimensions (e.g. channels) are preserved.
        sr: Sample rate of ``song`` in Hz.
        slice_duration: Length of each excerpt in seconds.
        bpm: Tempo used to derive the bar length.
        num_slices: Number of excerpts to return (at least one is always
            returned).

    Returns:
        A list of tensors. When the song is not longer than one excerpt there
        are no valid random start points and the first excerpt is repeated.
    """
    # Work in whole samples throughout; measuring along the last axis keeps
    # the arithmetic correct for multi-channel audio as well as mono.
    total_samples = song.shape[-1]
    slice_samples = int(slice_duration * sr)
    bar_samples = max(1, int(4 * 60 / bpm * sr))  # range() needs a step >= 1

    first_slice = song[..., :slice_samples]
    slices = [first_slice]

    # Candidate start points do not change between iterations, so build the
    # (lazy) range once instead of materialising a list per slice.
    start_candidates = range(slice_samples, total_samples, bar_samples)

    for _ in range(1, num_slices):
        if not start_candidates:
            # Song too short for a second excerpt: reuse the opening one.
            slices.append(first_slice)
            continue

        start = random.choice(start_candidates)
        end = start + slice_samples
        if end > total_samples:
            # Wrap around. Because slice_samples <= start < total_samples,
            # the wrapped head is always fully available and the excerpt is
            # exactly slice_samples long, so no further padding is needed.
            overflow = end - total_samples
            excerpt = torch.cat([song[..., start:], song[..., :overflow]], dim=-1)
        else:
            excerpt = song[..., start:end]
        slices.append(excerpt)

    return slices
def calculate_duration(bpm, min_duration=29, max_duration=30):
    """Return a duration in seconds that is a whole number of 4-beat bars.

    Starting from the number of full bars that fit into ``min_duration`` (but
    never fewer than one), bars are added until ``min_duration`` is reached
    and then removed again while the total exceeds ``max_duration``. The
    upper bound therefore wins when both cannot be met, except that at least
    one bar is always kept.
    """
    seconds_per_bar = 4 * 60 / bpm
    bar_count = max(min_duration // seconds_per_bar, 1)

    # Grow until the lower bound is satisfied.
    while seconds_per_bar * bar_count < min_duration:
        bar_count += 1

    # Shrink while the upper bound is violated, keeping at least one bar.
    while bar_count > 1 and seconds_per_bar * bar_count > max_duration:
        bar_count -= 1

    return seconds_per_bar * bar_count
@spaces.GPU(duration=60)
def generate_midi(seed, use_chords, chord_progression, bpm):
    """Generate a score with MusicLang and render it to a WAV file.

    Args:
        seed: Seed as entered by the user. A blank or non-integer value is
            replaced by a random integer between 1 and 10000.
        use_chords: When true and ``chord_progression`` is non-blank, the
            score is generated from that progression; otherwise it is
            generated freely.
        chord_progression: Chord progression text passed to MusicLang.
        bpm: Tempo written into the MIDI file.

    Returns:
        The path of the rendered WAV file (``output_<seed>.wav``).

    Raises:
        subprocess.CalledProcessError: If fluidsynth exits with a non-zero
            status.
        RuntimeError: If fluidsynth finished but produced no WAV file.
    """
    # int("") raises ValueError and int(None) raises TypeError, so one
    # handler covers blank, missing and non-numeric seeds alike.
    try:
        seed = int(seed)
    except (TypeError, ValueError):
        seed = random.randint(1, 10000)

    ml = MusicLangPredictor('musiclang/musiclang-v2')

    nb_tokens = 1024
    temperature = 0.9
    top_p = 1.0

    if use_chords and chord_progression and chord_progression.strip():
        score = ml.predict_chords(
            chord_progression,
            time_signature=(4, 4),
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )
    else:
        score = ml.predict(
            nb_tokens=nb_tokens,
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )

    midi_filename = f"output_{seed}.mid"
    wav_filename = midi_filename.replace(".mid", ".wav")

    score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))
    try:
        # check=True surfaces a failed render here instead of handing a
        # non-existent file path back to the UI.
        subprocess.run(
            ["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"],
            check=True,
        )
    finally:
        # The MIDI file is only an intermediate; remove it even when
        # rendering fails.
        os.remove(midi_filename)

    if not os.path.exists(wav_filename):
        raise RuntimeError(f"fluidsynth did not produce {wav_filename}")

    return wav_filename
@spaces.GPU(duration=90)
def generate_music(wav_filename, prompt_duration, musicgen_model, num_iterations, bpm):
    """Continue excerpts of a WAV file with MusicGen and join the results.

    Args:
        wav_filename: Path of the audio file to take prompts from.
        prompt_duration: Seconds taken from the start of each excerpt and
            used as the continuation prompt.
        musicgen_model: Model choice; everything before the first space is
            used as the pretrained model name.
        num_iterations: Number of continuations to generate. Excerpts are
            reused cyclically when this exceeds the number of excerpts.
        bpm: Tempo used to pick excerpt positions and the generation length.

    Returns:
        The path of an MP3 file containing all continuations back to back.
    """
    # Load the audio from the passed file path
    song, sr = torchaudio.load(wav_filename)
    song = song.to(device)

    # Generation length and excerpt positions both follow the user's BPM.
    duration = calculate_duration(bpm)
    slices = create_slices(song, sr, 35, bpm, num_slices=5)

    # Load the model
    model_name = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(model_name)

    # Setting generation parameters
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=duration,
        cfg_coef=3
    )

    prompt_samples = int(prompt_duration * sr)
    generated_paths = []
    try:
        for i in range(int(num_iterations)):
            slice_idx = i % len(slices)
            print(f"Running iteration {i + 1} using slice {slice_idx}...")

            prompt_waveform = preprocess_audio(slices[slice_idx][..., :prompt_samples])

            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
            output = output.cpu()  # Move the output tensor back to CPU

            # Make sure the output tensor has at most 2 dimensions
            if output.dim() > 2:
                output = output.squeeze()

            # audio_write appends its own suffix to the stem and returns the
            # path it actually wrote, so use that rather than guessing it.
            written_path = audio_write(
                f'continue_{i}.wav',
                output,
                model_continue.sample_rate,
                strategy="loudness",
                loudness_compressor=True,
            )
            generated_paths.append(str(written_path))

        # Combine all audio files
        combined_audio = AudioSegment.empty()
        for path in generated_paths:
            combined_audio += AudioSegment.from_wav(path)

        combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
        # export() returns the open output file; close it so the handle is
        # not leaked.
        combined_audio.export(combined_audio_filename, format="mp3").close()
    finally:
        # Clean up the intermediate WAV files, also when something failed.
        for path in generated_paths:
            if os.path.exists(path):
                os.remove(path)

    return combined_audio_filename
@spaces.GPU(duration=90)
def continue_music(input_audio_path, prompt_duration, musicgen_model, num_iterations, bpm):
    """Extend an audio file by repeatedly continuing its tail with MusicGen.

    On every iteration the last ``prompt_duration`` seconds of the current
    audio are used as the prompt, and that tail is then replaced by the
    generated audio.

    Args:
        input_audio_path: Path of the audio file to extend.
        prompt_duration: Length in seconds of the tail used as the prompt.
        musicgen_model: Model choice; everything before the first space is
            used as the pretrained model name.
        num_iterations: Number of continuation rounds.
        bpm: Tempo used to derive the generation length.

    Returns:
        The path of an MP3 file containing the extended audio.

    Raises:
        ValueError: If the current audio is shorter than ``prompt_duration``.
    """
    # Load the model and set generation parameters
    model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=calculate_duration(bpm),
        cfg_coef=3
    )

    # from_file detects the container itself, so the input does not have to
    # be an MP3.
    current_audio = AudioSegment.from_file(input_audio_path)
    prompt_ms = int(prompt_duration * 1000)

    file_paths_for_cleanup = []  # Generated files to delete afterwards
    try:
        for i in range(int(num_iterations)):
            if len(current_audio) < prompt_ms:
                raise ValueError("The prompt_duration is longer than the current audio length.")

            # Take the prompt from the end of the current audio.
            start_time = len(current_audio) - prompt_ms
            prompt_audio = current_audio[start_time:]

            # Convert the prompt to a tensor via an in-memory WAV.
            wav_buffer = io.BytesIO()
            prompt_audio.export(wav_buffer, format="wav")
            wav_buffer.seek(0)
            # Keep the rate of the prompt that was actually decoded: after
            # the first round the audio may no longer be at the input file's
            # rate, and the model needs the true rate of what it is given.
            prompt_waveform, prompt_sr = torchaudio.load(wav_buffer)

            # Prepare the audio slice for generation
            prompt_waveform = preprocess_audio(prompt_waveform)

            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=prompt_sr, progress=True)
            output = output.cpu()  # Move the output tensor back to CPU

            if output.dim() > 2:
                output = output.squeeze()

            # audio_write appends its own suffix to the stem and returns the
            # path it actually wrote, so use that rather than guessing it.
            written_path = str(audio_write(
                f'continue_{i}.wav',
                output,
                model_continue.sample_rate,
                strategy="loudness",
                loudness_compressor=True,
            ))
            file_paths_for_cleanup.append(written_path)
            generated_audio_segment = AudioSegment.from_wav(written_path)

            # Replace the prompt portion with the generated audio
            current_audio = current_audio[:start_time] + generated_audio_segment

        combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
        # export() returns the open output file; close it so the handle is
        # not leaked.
        current_audio.export(combined_audio_filename, format="mp3").close()
    finally:
        # Clean up the intermediate WAV files, also when something failed.
        for file_path in file_paths_for_cleanup:
            if os.path.exists(file_path):
                os.remove(file_path)

    return combined_audio_filename
# Markdown shown to the user through gr.Markdown in the interface below.
# These strings are rendered verbatim, so edits here change the page text.

# Short description of MusicLang with links to its GitHub repo and HF space.
musiclang_blurb = """
## musiclang
musiclang is a controllable ai midi model. it can generate midi sequences based on user-provided parameters, or unconditionally.
[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> musiclang github](https://github.com/MusicLang/musiclang_predict)
[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)
"""
# Short description of MusicGen and of continuations, with example links.
musicgen_blurb = """
## musicgen
musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)
visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
see also https://youtube.com/@thecollabagepatch
"""
# Credits for the community fine-tunes, with Discord and Colab links.
finetunes_blurb = """
## fine-tuned models
the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)
[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
"""
# Per-model Markdown notes, one "##" section per fine-tune. The section
# titles match the model names offered in the "MusicGen Model" dropdown.
# Rendered verbatim through gr.Markdown, so edits here change the page text.
fine_tunes_info = """
## thepatch/vanya_ai_dnb_0.1
thepatch/vanya_ai_dnb_0.1 was trained by vanya. [![Twitter](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)](https://twitter.com/@veryVANYA) . it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well)
## thepatch/bleeps-medium
thepatch/bleeps-medium was trained by kevin and lyra [![Twitter](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)](https://twitter.com/@_lyraaaa_) . it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. It can be amazing at turning your chords into pads, and is a good percussionist.
## thepatch/budots_remix
thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic, kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.
## thepatch/hoenn_lofi
thepatch/hoenn_lofi is a large fine-tune by hoenn. [![Twitter](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)](https://twitter.com/@eschatolocation) . this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.
## thepatch/PhonkV2
thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.
"""
# Build the Gradio interface: intro text, collapsible info sections, a MIDI
# generation column and a MusicGen column, then wire the buttons to the
# handler functions and start the app.
# NOTE(review): the source this was recovered from had lost its indentation,
# so the nesting below is reconstructed; in particular, confirm whether the
# "fine-tunes info" accordion belongs inside "more info" or next to it.
with gr.Blocks() as iface:
    gr.Markdown("# the-slot-machine")
    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
    gr.Markdown("this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model continue, semi-randomly, from different sections of the midi track. the slot machine combines em all at the end into something very bizarre. pick a number for the seed between 1 and 10k, or leave it blank to unlock the full rnjesus powers. if you wanna be lame, you can control the chord progression, prompt duration, musicgen model, number of iterations, and BPM.")

    with gr.Accordion("more info", open=False):
        gr.Markdown(musiclang_blurb)
        gr.Markdown(musicgen_blurb)
        gr.Markdown(finetunes_blurb)

    with gr.Accordion("fine-tunes info", open=False):
        gr.Markdown(fine_tunes_info)

    with gr.Row():
        # Left column: MusicLang / MIDI controls.
        with gr.Column():
            seed = gr.Textbox(label="Seed (leave blank for random)", value="")
            use_chords = gr.Checkbox(label="Control Chord Progression", value=False)
            chord_progression = gr.Textbox(label="Chord Progression (e.g., Am CM Dm E7 Am)", visible=True)
            bpm = gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=120)
            generate_midi_button = gr.Button("Generate MIDI")
            # type="filepath" so the handlers receive and return file paths.
            midi_audio = gr.Audio(label="Generated MIDI Audio", type="filepath")

        # Right column: MusicGen controls and outputs.
        with gr.Column():
            prompt_duration = gr.Dropdown(label="Prompt Duration (seconds)", choices=list(range(1, 11)), value=5)
            musicgen_model = gr.Dropdown(label="MusicGen Model", choices=[
                "thepatch/vanya_ai_dnb_0.1 (small)",
                "thepatch/budots_remix (small)",
                "thepatch/PhonkV2 (small)",
                "thepatch/bleeps-medium (medium)",
                "thepatch/hoenn_lofi (large)"
            ], value="thepatch/vanya_ai_dnb_0.1 (small)")
            # Fixed at 1 (minimum == maximum), hence the label.
            num_iterations = gr.Slider(label="this does nothing rn", minimum=1, maximum=1, step=1, value=1)
            generate_music_button = gr.Button("Generate Music")
            output_audio = gr.Audio(label="Generated Music", type="filepath")
            continue_button = gr.Button("Continue Generating Music")
            continue_output_audio = gr.Audio(label="Continued Music Output", type="filepath")

    # Connecting the components: each stage feeds its output audio into the
    # next stage's handler.
    generate_midi_button.click(generate_midi, inputs=[seed, use_chords, chord_progression, bpm], outputs=[midi_audio])
    generate_music_button.click(generate_music, inputs=[midi_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=[output_audio])
    continue_button.click(continue_music, inputs=[output_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=[continue_output_audio])

iface.launch()