thecollabagepatch committed
Commit c684cf6 · 1 Parent(s): 8ecf4fa

first attempt here

Files changed (4)
  1. .gitmodules +3 -3
  2. app.py +435 -319
  3. requirements.txt +22 -7
  4. stable-audio-tools +1 -0
.gitmodules CHANGED
@@ -1,3 +1,3 @@
- [submodule "audiocraft"]
- path = audiocraft
- url = https://github.com/facebookresearch/audiocraft
+ [submodule "stable-audio-tools"]
+ path = stable-audio-tools
+ url = https://github.com/Stability-AI/stable-audio-tools.git
app.py CHANGED
@@ -1,344 +1,460 @@
1
  import gradio as gr
2
  import spaces
3
- from musiclang_predict import MusicLangPredictor
4
- import random
5
- import subprocess
6
- import os
7
- import torchaudio
8
  import torch
9
- import numpy as np
10
- from audiocraft.models import MusicGen
11
- from audiocraft.data.audio import audio_write
12
- from pydub import AudioSegment
13
-
14
- import tempfile
15
- from pydub import AudioSegment
16
  import io
17
 
18
- # Check if CUDA is available
19
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
-
21
- # Utility Functions
22
- def peak_normalize(y, target_peak=0.97):
23
- return target_peak * (y / np.max(np.abs(y)))
24
-
25
- def rms_normalize(y, target_rms=0.05):
26
- return y * (target_rms / np.sqrt(np.mean(y**2)))
27
-
28
- def preprocess_audio(waveform):
29
- waveform_np = waveform.cpu().squeeze().numpy() # Move to CPU before converting to NumPy
30
- # processed_waveform_np = rms_normalize(peak_normalize(waveform_np))
31
- return torch.from_numpy(waveform_np).unsqueeze(0).to(device)
32
-
33
- def create_slices(song, sr, slice_duration, bpm, num_slices=5):
34
- song_length = song.shape[-1] / sr
35
- slices = []
36
-
37
- # Ensure the first slice is from the beginning of the song
38
- first_slice_waveform = song[..., :int(slice_duration * sr)]
39
- slices.append(first_slice_waveform)
40
-
41
- for i in range(1, num_slices):
42
- possible_start_indices = list(range(int(slice_duration * sr), int(song_length * sr), int(4 * 60 / bpm * sr)))
43
- if not possible_start_indices:
44
- # If there are no valid start indices, duplicate the first slice
45
- slices.append(first_slice_waveform)
46
- continue
47
 
48
- random_start = random.choice(possible_start_indices)
49
- slice_end = random_start + int(slice_duration * sr)
 
 
50
 
51
- if slice_end > song_length * sr:
52
- # Wrap around to the beginning of the song
53
- remaining_samples = int(slice_end - song_length * sr)
54
- slice_waveform = torch.cat([song[..., random_start:], song[..., :remaining_samples]], dim=-1)
55
- else:
56
- slice_waveform = song[..., random_start:slice_end]
 
57
 
58
- if len(slice_waveform.squeeze()) < int(slice_duration * sr):
59
- additional_samples_needed = int(slice_duration * sr) - len(slice_waveform.squeeze())
60
- slice_waveform = torch.cat([slice_waveform, song[..., :additional_samples_needed]], dim=-1)
61
 
62
- slices.append(slice_waveform)
 
 
63
 
64
- return slices
65
-
66
- def calculate_duration(bpm, min_duration=29, max_duration=30):
67
- single_bar_duration = 4 * 60 / bpm
68
- bars = max(min_duration // single_bar_duration, 1)
69
-
70
- while single_bar_duration * bars < min_duration:
71
- bars += 1
72
-
73
- duration = single_bar_duration * bars
74
-
75
- while duration > max_duration and bars > 1:
76
- bars -= 1
77
- duration = single_bar_duration * bars
 
78
 
79
- return duration
80
-
81
- @spaces.GPU(duration=60)
82
- def generate_midi(seed, use_chords, chord_progression, bpm):
83
- if seed == "":
84
- seed = random.randint(1, 10000)
85
-
86
- ml = MusicLangPredictor('musiclang/musiclang-v2')
87
 
 
 
88
  try:
89
- seed = int(seed)
90
- except ValueError:
91
- seed = random.randint(1, 10000)
92
-
93
- nb_tokens = 1024
94
- temperature = 0.9
95
- top_p = 1.0
96
-
97
- if use_chords and chord_progression.strip():
98
- score = ml.predict_chords(
99
- chord_progression,
100
- time_signature=(4, 4),
101
- temperature=temperature,
102
- topp=top_p,
103
- rng_seed=seed
104
- )
105
- else:
106
- score = ml.predict(
107
- nb_tokens=nb_tokens,
108
- temperature=temperature,
109
- topp=top_p,
110
- rng_seed=seed
111
- )
112
-
113
- midi_filename = f"output_{seed}.mid"
114
- wav_filename = midi_filename.replace(".mid", ".wav")
115
-
116
- score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))
117
-
118
- subprocess.run(["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"])
119
-
120
- # Clean up temporary MIDI file
121
- os.remove(midi_filename)
122
-
123
- sample_rate = 44100 # Assuming fixed sample rate from fluidsynth command
124
- return wav_filename
125
-
126
- @spaces.GPU(duration=120)
127
- def generate_music(wav_filename, prompt_duration, musicgen_model, num_iterations, bpm):
128
- # Load the audio from the passed file path
129
- song, sr = torchaudio.load(wav_filename)
130
- song = song.to(device)
131
- # Use the user-provided BPM value for duration calculation
132
- duration = calculate_duration(bpm)
133
-
134
- # Create slices from the song using the user-provided BPM value
135
- slices = create_slices(song, sr, 35, bpm, num_slices=5)
136
-
137
- # Load the model
138
- model_name = musicgen_model.split(" ")[0]
139
- model_continue = MusicGen.get_pretrained(model_name)
140
-
141
- # Setting generation parameters
142
- model_continue.set_generation_params(
143
- use_sampling=True,
144
- top_k=250,
145
- top_p=0.0,
146
- temperature=1.0,
147
- duration=duration,
148
- cfg_coef=3
149
- )
150
-
151
- all_audio_files = []
152
-
153
- for i in range(num_iterations):
154
- slice_idx = i % len(slices)
155
 
156
- print(f"Running iteration {i + 1} using slice {slice_idx}...")
 
 
 
 
157
 
158
- prompt_waveform = slices[slice_idx][..., :int(prompt_duration * sr)]
159
- prompt_waveform = preprocess_audio(prompt_waveform)
 
 
160
 
161
- output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
162
- output = output.cpu() # Move the output tensor back to CPU
163
 
164
- # Make sure the output tensor has at most 2 dimensions
165
- if len(output.size()) > 2:
166
- output = output.squeeze()
167
 
168
- filename_without_extension = f'continue_{i}'
169
- filename_with_extension = f'{filename_without_extension}.wav'
170
 
171
- audio_write(filename_with_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
172
- all_audio_files.append(f'{filename_without_extension}.wav.wav') # Assuming the library appends an extra .wav
173
-
174
- # Combine all audio files
175
- combined_audio = AudioSegment.empty()
176
- for filename in all_audio_files:
177
- combined_audio += AudioSegment.from_wav(filename)
178
-
179
- combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
180
- combined_audio.export(combined_audio_filename, format="mp3")
181
-
182
- # Clean up temporary files
183
- for filename in all_audio_files:
184
- os.remove(filename)
185
-
186
- return combined_audio_filename
187
-
188
- @spaces.GPU(duration=120)
189
- def continue_music(input_audio_path, prompt_duration, musicgen_model, num_iterations, bpm):
190
- # Load the audio from the given file path
191
- song, sr = torchaudio.load(input_audio_path)
192
- song = song.to(device)
193
-
194
- # Load the model and set generation parameters
195
- model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
196
- model_continue.set_generation_params(
197
- use_sampling=True,
198
- top_k=250,
199
- top_p=0.0,
200
- temperature=1.0,
201
- duration=calculate_duration(bpm),
202
- cfg_coef=3
203
- )
204
-
205
- original_audio = AudioSegment.from_mp3(input_audio_path)
206
- current_audio = original_audio
207
-
208
- file_paths_for_cleanup = [] # List to track generated file paths for cleanup
209
-
210
- for i in range(num_iterations):
211
- # Calculate the slice from the end of the current audio based on prompt_duration
212
- num_samples = int(prompt_duration * sr)
213
- if current_audio.duration_seconds * 1000 < prompt_duration * 1000:
214
- raise ValueError("The prompt_duration is longer than the current audio length.")
215
-
216
- start_time = current_audio.duration_seconds * 1000 - prompt_duration * 1000
217
- prompt_audio = current_audio[start_time:]
218
-
219
- # Convert the prompt audio to a PyTorch tensor
220
- prompt_bytes = prompt_audio.export(format="wav").read()
221
- prompt_waveform, _ = torchaudio.load(io.BytesIO(prompt_bytes))
222
- prompt_waveform = prompt_waveform.to(device)
223
-
224
- # Prepare the audio slice for generation
225
- prompt_waveform = preprocess_audio(prompt_waveform)
226
-
227
- output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
228
- output = output.cpu() # Move the output tensor back to CPU
229
-
230
- if len(output.size()) > 2:
231
- output = output.squeeze()
232
-
233
- filename_without_extension = f'continue_{i}'
234
- filename_with_extension = f'{filename_without_extension}.wav'
235
- correct_filename_extension = f'{filename_without_extension}.wav.wav' # Apply the workaround for audio_write
236
-
237
- audio_write(filename_with_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
238
- generated_audio_segment = AudioSegment.from_wav(correct_filename_extension)
239
-
240
- # Replace the prompt portion with the generated audio
241
- current_audio = current_audio[:start_time] + generated_audio_segment
242
-
243
- file_paths_for_cleanup.append(correct_filename_extension) # Add to cleanup list
244
-
245
- combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
246
- current_audio.export(combined_audio_filename, format="mp3")
247
-
248
- # Clean up temporary files using the list of file paths
249
- for file_path in file_paths_for_cleanup:
250
- os.remove(file_path)
251
-
252
- return combined_audio_filename
253
-
254
-
255
-
256
- # Define the expandable sections
257
- musiclang_blurb = """
258
- ## musiclang
259
- musiclang is a controllable ai midi model. it can generate midi sequences based on user-provided parameters, or unconditionally.
260
- [<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> musiclang github](https://github.com/MusicLang/musiclang_predict)
261
- [<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)
262
- """
263
-
264
- musicgen_blurb = """
265
- ## musicgen
266
- musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
267
- [<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)
268
- visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
269
- see also https://youtube.com/@thecollabagepatch
270
- """
271
-
272
- finetunes_blurb = """
273
- ## fine-tuned models
274
- the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
275
- [<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)
276
- [<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
277
- """
278
-
279
- # Define the fine-tunes blurb for each model
280
- fine_tunes_info = """
281
- ## thepatch/vanya_ai_dnb_0.1
282
- thepatch/vanya_ai_dnb_0.1 was trained by vanya. [vanya's Twitter](https://twitter.com/@veryVANYA) 🔗 - it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well)
283
-
284
- ## thepatch/bleeps-medium
285
- thepatch/bleeps-medium was trained by kevin and lyra [lyra's Twitter](https://twitter.com/@_lyraaaa_) 🔗 - it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. It can be amazing at turning your chords into pads, and is a good percussionist.
286
-
287
- ## thepatch/budots_remix
288
- thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic, kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.
289
-
290
- ## thepatch/hoenn_lofi
291
- thepatch/hoenn_lofi is a large fine-tune by hoenn. [hoenn's Twitter](https://twitter.com/@eschatolocation) 🔗 - this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.
292
-
293
- ## thepatch/PhonkV2
294
- thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.
295
-
296
- ## foureyednymph/musicgen-sza-sos-small
297
- foureyednymph/musicgen-sza-sos-small was just trained by foureyednymph. We're all about to find out if it does continuations well.
298
- """
299
-
300
- # Create the Gradio interface
301
- with gr.Blocks() as iface:
302
- gr.Markdown("# the-slot-machine")
303
- gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
304
- gr.Markdown("this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model. trim it so that you like the beginning of the output, and choose the prompt duration. Then we give it to musicgen to continue for 30 seconds. We can then choose a new model and prompt duration, trim it, and give it to musicgen to continue from the end of the output. Re-upload, trim again and repeat with a new musicgen model and different prompt duration if you want. ")
305
-
306
- with gr.Accordion("more info", open=False):
307
- gr.Markdown(musiclang_blurb)
308
- gr.Markdown(musicgen_blurb)
309
- gr.Markdown(finetunes_blurb)
310
 
311
- with gr.Accordion("fine-tunes info", open=False):
312
- gr.Markdown(fine_tunes_info)
313
 
314
  with gr.Row():
315
  with gr.Column():
316
- seed = gr.Textbox(label="Seed (leave blank for random)", value="")
317
- use_chords = gr.Checkbox(label="Control Chord Progression", value=False)
318
- chord_progression = gr.Textbox(label="Chord Progression (e.g., Am CM Dm E7 Am)", visible=True)
319
- bpm = gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=120)
320
- generate_midi_button = gr.Button("Generate MIDI")
321
- midi_audio = gr.Audio(label="Generated MIDI Audio", type="filepath") # Ensure this is set to handle file paths
322
-
323
  with gr.Column():
324
- prompt_duration = gr.Dropdown(label="Prompt Duration (seconds)", choices=list(range(1, 11)), value=5)
325
- musicgen_model = gr.Dropdown(label="MusicGen Model", choices=[
326
- "thepatch/vanya_ai_dnb_0.1 (small)",
327
- "thepatch/budots_remix (small)",
328
- "thepatch/PhonkV2 (small)",
329
- "thepatch/bleeps-medium (medium)",
330
- "thepatch/hoenn_lofi (large)",
331
- "foureyednymph/musicgen-sza-sos-small (small)"
332
- ], value="thepatch/vanya_ai_dnb_0.1 (small)")
333
- num_iterations = gr.Slider(label="this does nothing rn", minimum=1, maximum=1, step=1, value=1)
334
- generate_music_button = gr.Button("Generate Music")
335
- output_audio = gr.Audio(label="Generated Music", type="filepath")
336
- continue_button = gr.Button("Continue Generating Music")
337
- continue_output_audio = gr.Audio(label="Continued Music Output", type="filepath")
338
-
339
- # Connecting the components
340
- generate_midi_button.click(generate_midi, inputs=[seed, use_chords, chord_progression, bpm], outputs=[midi_audio])
341
- generate_music_button.click(generate_music, inputs=[midi_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=[output_audio])
342
- continue_button.click(continue_music, inputs=[output_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=continue_output_audio)
343
 
344
- iface.launch()
 
 
1
  import gradio as gr
2
  import spaces
3
  import torch
4
+ import torchaudio
5
  import io
6
+ import base64
7
+ import uuid
8
+ import os
9
+ import time
10
+ import re
11
+ import threading
12
+ import gc
13
+ import random
14
+ import numpy as np
15
+ from einops import rearrange
16
+ from huggingface_hub import login
17
+ from stable_audio_tools import get_pretrained_model
18
+ from stable_audio_tools.inference.generation import generate_diffusion_cond
19
+ from gradio_client import Client, handle_file
20
+ from contextlib import contextmanager
21
+
22
+ # Global model storage
23
+ model_cache = {}
24
+ model_lock = threading.Lock()
25
+
26
+ @contextmanager
27
+ def resource_cleanup():
28
+ """Context manager to ensure proper cleanup of GPU resources."""
29
+ try:
30
+ yield
31
+ finally:
32
+ if torch.cuda.is_available():
33
+ torch.cuda.synchronize()
34
+ torch.cuda.empty_cache()
35
+ gc.collect()
36
+
37
+ def load_stable_audio_model():
38
+ """Load stable-audio-open-small model if not already loaded."""
39
+ with model_lock:
40
+ if 'stable_audio_model' not in model_cache:
41
+ print("πŸ”„ Loading stable-audio-open-small model...")
42
+
43
+ # Authenticate with HF
44
+ hf_token = os.getenv('HF_TOKEN')
45
+ if hf_token:
46
+ login(token=hf_token)
47
+ print(f"βœ… HF authenticated")
48
+
49
+ # Load model
50
+ model, config = get_pretrained_model("stabilityai/stable-audio-open-small")
51
+ device = "cuda" if torch.cuda.is_available() else "cpu"
52
+ model = model.to(device)
53
+ if device == "cuda":
54
+ model = model.half()
55
+
56
+ model_cache['stable_audio_model'] = model
57
+ model_cache['stable_audio_config'] = config
58
+ model_cache['stable_audio_device'] = device
59
+ print(f"βœ… Stable Audio model loaded on {device}")
60
+
61
+ return (model_cache['stable_audio_model'],
62
+ model_cache['stable_audio_config'],
63
+ model_cache['stable_audio_device'])
64
 
65
+ @spaces.GPU
66
+ def generate_stable_audio_loop(prompt, loop_type, bpm, bars, seed=-1):
67
+ """Generate a BPM-aware loop using stable-audio-open-small"""
68
+ try:
69
+ model, config, device = load_stable_audio_model()
70
 
71
+ # Calculate loop duration based on BPM and bars
72
+ seconds_per_beat = 60.0 / bpm
73
+ seconds_per_bar = seconds_per_beat * 4 # 4/4 time
74
+ target_loop_duration = seconds_per_bar * bars
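+ # e.g. at 120 BPM: 0.5 s per beat and 2.0 s per bar, so 4 bars -> an 8.0 s target loop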
75
 
76
+ # Enhance prompt based on loop type and BPM
77
+ if loop_type == "drums":
78
+ enhanced_prompt = f"{prompt} drum loop {bpm}bpm"
79
+ negative_prompt = "melody, harmony, pitched instruments, vocals, singing"
80
+ else: # instruments
81
+ enhanced_prompt = f"{prompt} instrumental loop {bpm}bpm"
82
+ negative_prompt = "drums, percussion, kick, snare, hi-hat"
83
 
84
+ # Set seed
85
+ if seed == -1:
86
+ seed = random.randint(0, 2**32 - 1)
87
 
88
+ torch.manual_seed(seed)
89
+ if device == "cuda":
90
+ torch.cuda.manual_seed(seed)
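+ # seeding both the CPU and CUDA RNGs helps a fixed seed value reproduce the same loop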
91
 
92
+ print(f"🎡 Generating {loop_type} loop:")
93
+ print(f" Enhanced prompt: {enhanced_prompt}")
94
+ print(f" Target duration: {target_loop_duration:.2f}s ({bars} bars at {bpm}bpm)")
95
+ print(f" Seed: {seed}")
96
+
97
+ # Prepare conditioning
98
+ conditioning = [{
99
+ "prompt": enhanced_prompt,
100
+ "seconds_total": 12 # Model generates 12s max
101
+ }]
102
+
103
+ negative_conditioning = [{
104
+ "prompt": negative_prompt,
105
+ "seconds_total": 12
106
+ }]
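+ # the model always renders its full window; the BPM-sized loop is trimmed from the start of it below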
107
+
108
+ start_time = time.time()
109
+
110
+ with resource_cleanup():
111
+ if device == "cuda":
112
+ torch.cuda.empty_cache()
113
+
114
+ with torch.cuda.amp.autocast(enabled=(device == "cuda")):
115
+ output = generate_diffusion_cond(
116
+ model,
117
+ steps=8, # Fast generation
118
+ cfg_scale=1.0, # Good balance for loops
119
+ conditioning=conditioning,
120
+ negative_conditioning=negative_conditioning,
121
+ sample_size=config["sample_size"],
122
+ sampler_type="pingpong",
123
+ device=device,
124
+ seed=seed
125
+ )
126
+
127
+ generation_time = time.time() - start_time
128
+
129
+ # Post-process audio
130
+ output = rearrange(output, "b d n -> d (b n)") # (2, N) stereo
131
+ output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1)
132
+
133
+ # Extract the loop portion
134
+ sample_rate = config["sample_rate"]
135
+ loop_samples = int(target_loop_duration * sample_rate)
136
+ available_samples = output.shape[1]
137
+
138
+ if loop_samples > available_samples:
139
+ loop_samples = available_samples
140
+ actual_duration = available_samples / sample_rate
141
+ print(f"⚠️ Requested {target_loop_duration:.2f}s, got {actual_duration:.2f}s")
142
+
143
+ # Extract loop from beginning (cleanest beat alignment)
144
+ loop_output = output[:, :loop_samples]
145
+ loop_output_int16 = loop_output.mul(32767).to(torch.int16).cpu()
146
+
147
+ # Save to temporary file
148
+ loop_filename = f"loop_{loop_type}_{bpm}bpm_{bars}bars_{seed}.wav"
149
+ torchaudio.save(loop_filename, loop_output_int16, sample_rate)
150
+
151
+ actual_duration = loop_samples / sample_rate
152
+ print(f"βœ… {loop_type.title()} loop generated: {actual_duration:.2f}s in {generation_time:.2f}s")
153
+
154
+ return loop_filename, f"Generated {actual_duration:.2f}s {loop_type} loop at {bpm}bpm ({bars} bars)"
155
 
156
+ except Exception as e:
157
+ print(f"❌ Generation error: {str(e)}")
158
+ return None, f"Error: {str(e)}"
159
 
160
+ def combine_loops(drums_audio, instruments_audio, bpm, bars, num_repeats):
161
+ """Combine drum and instrument loops with specified repetitions"""
162
  try:
163
+ if not drums_audio and not instruments_audio:
164
+ return None, "No audio files to combine"
165
 
166
+ # Calculate timing
167
+ seconds_per_beat = 60.0 / bpm
168
+ seconds_per_bar = seconds_per_beat * 4
169
+ loop_duration = seconds_per_bar * bars
170
+ total_duration = loop_duration * num_repeats
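+ # e.g. a 4-bar loop at 120 BPM lasts 8.0 s, so num_repeats=2 yields 16.0 s of combined audio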
171
 
172
+ print(f"πŸŽ›οΈ Combining loops:")
173
+ print(f" Loop duration: {loop_duration:.2f}s ({bars} bars)")
174
+ print(f" Repeats: {num_repeats}")
175
+ print(f" Total duration: {total_duration:.2f}s")
176
 
177
+ combined_audio = None
178
+ sample_rate = None
179
 
180
+ # Process each audio file
181
+ for audio_path, audio_type in [(drums_audio, "drums"), (instruments_audio, "instruments")]:
182
+ if audio_path:
183
+ # Load audio
184
+ waveform, sr = torchaudio.load(audio_path)
185
+ if sample_rate is None:
186
+ sample_rate = sr
187
+
188
+ # Ensure we have the exact loop duration
189
+ target_samples = int(loop_duration * sr)
190
+ if waveform.shape[1] > target_samples:
191
+ waveform = waveform[:, :target_samples]
192
+ elif waveform.shape[1] < target_samples:
193
+ # Pad if necessary
194
+ padding = target_samples - waveform.shape[1]
195
+ waveform = torch.cat([waveform, torch.zeros(waveform.shape[0], padding)], dim=1)
196
+
197
+ # Repeat the loop
198
+ repeated_waveform = waveform.repeat(1, num_repeats)
199
+
200
+ print(f" {audio_type}: {waveform.shape[1]/sr:.2f}s repeated {num_repeats}x = {repeated_waveform.shape[1]/sr:.2f}s")
201
+
202
+ # Add to combined audio
203
+ if combined_audio is None:
204
+ combined_audio = repeated_waveform
205
+ else:
206
+ combined_audio = combined_audio + repeated_waveform
207
 
208
+ if combined_audio is None:
209
+ return None, "No valid audio to combine"
210
 
211
+ # Normalize to prevent clipping
212
+ combined_audio = combined_audio / torch.max(torch.abs(combined_audio))
213
+ combined_audio = combined_audio.clamp(-1, 1)
214
+
215
+ # Convert to int16 and save
216
+ combined_audio_int16 = combined_audio.mul(32767).to(torch.int16)
217
+ combined_filename = f"combined_{bpm}bpm_{bars}bars_{num_repeats}loops_{random.randint(1000, 9999)}.wav"
218
+ torchaudio.save(combined_filename, combined_audio_int16, sample_rate)
219
+
220
+ actual_duration = combined_audio.shape[1] / sample_rate
221
+ status = f"Combined into {actual_duration:.2f}s audio ({num_repeats} Γ— {bars} bars at {bpm}bpm)"
222
+
223
+ print(f"βœ… {status}")
224
+ return combined_filename, status
225
+
226
+ except Exception as e:
227
+ print(f"❌ Combine error: {str(e)}")
228
+ return None, f"Combine error: {str(e)}"
229
+
230
+ def transform_with_melodyflow_api(audio_path, prompt, solver="euler", flowstep=0.12):
231
+ """Transform audio using Facebook/MelodyFlow space API"""
232
+ if audio_path is None:
233
+ return None, "❌ No audio file provided"
234
+
235
+ try:
236
+ # Initialize client for Facebook MelodyFlow space
237
+ client = Client("facebook/MelodyFlow")
238
+
239
+ # Set steps based on solver
240
+ if solver == "midpoint":
241
+ base_steps = 128
242
+ effective_steps = base_steps // 2 # 64 effective steps
243
+ else: # euler
244
+ base_steps = 125
245
+ effective_steps = base_steps // 5 # 25 effective steps
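+ # effective_steps is informational (used in the log and status text); the full base_steps count is sent to the API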
246
+
247
+ print(f"πŸŽ›οΈ MelodyFlow transformation:")
248
+ print(f" Prompt: {prompt}")
249
+ print(f" Solver: {solver} ({effective_steps} effective steps)")
250
+ print(f" Flowstep: {flowstep}")
251
+
252
+ # Call the MelodyFlow API
253
+ result = client.predict(
254
+ model="facebook/melodyflow-t24-30secs",
255
+ text=prompt,
256
+ solver=solver,
257
+ steps=base_steps,
258
+ target_flowstep=flowstep,
259
+ regularize=solver == "euler",
260
+ regularization_strength=0.2,
261
+ duration=30,
262
+ melody=handle_file(audio_path),
263
+ api_name="/predict"
264
+ )
265
+
266
+ if result and len(result) > 0 and result[0]:
267
+ # Save the result locally
268
+ output_filename = f"melodyflow_transformed_{random.randint(1000, 9999)}.wav"
269
+ import shutil
270
+ shutil.copy2(result[0], output_filename)
271
+
272
+ status_msg = f"βœ… Transformed with prompt: '{prompt}' (flowstep: {flowstep}, {effective_steps} steps)"
273
+ return output_filename, status_msg
274
+ else:
275
+ return None, "❌ MelodyFlow API returned no results"
276
+
277
+ except Exception as e:
278
+ return None, f"❌ MelodyFlow API error: {str(e)}"
279
+
280
+ def calculate_optimal_bars(bpm):
281
+ """Calculate optimal bar count for given BPM to fit in ~10s"""
282
+ seconds_per_beat = 60.0 / bpm
283
+ seconds_per_bar = seconds_per_beat * 4
284
+ max_duration = 10.0
285
+
286
+ for bars in [8, 4, 2, 1]:
287
+ if seconds_per_bar * bars <= max_duration:
288
+ return bars
289
+ return 1
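+ # e.g. calculate_optimal_bars(120) -> 4 (8.0 s of audio), calculate_optimal_bars(90) -> 2 (about 5.3 s)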
290
 
291
+ # ========== GRADIO INTERFACE ==========
 
292
 
293
+ with gr.Blocks(title="🎡 Stable Audio Loop Generator") as iface:
294
+ gr.Markdown("# 🎡 Stable Audio Loop Generator")
295
+ gr.Markdown("**Generate synchronized drum and instrument loops with stable-audio-open-small, then transform with MelodyFlow!**")
296
+
297
+ with gr.Accordion("How This Works", open=False):
298
+ gr.Markdown("""
299
+ **Workflow:**
300
+ 1. **Set global BPM and bars** - affects both drum and instrument generation
301
+ 2. **Generate drum loop** - creates BPM-aware percussion
302
+ 3. **Generate instrument loop** - creates melodic/harmonic content
303
+ 4. **Combine loops** - layer them together with repetitions (up to 30s)
304
+ 5. **Transform** - use MelodyFlow to stylistically transform the combined result
305
+
306
+ **Features:**
307
+ - BPM-aware generation ensures perfect sync between loops
308
+ - Negative prompting separates drums from instruments cleanly
309
+ - Smart bar calculation optimizes loop length for the BPM
310
+ - MelodyFlow integration for advanced style transfer
311
+ """)
312
+
313
+ # ========== GLOBAL CONTROLS ==========
314
+ gr.Markdown("## πŸŽ›οΈ Global Settings")
315
+
316
+ with gr.Row():
317
+ global_bpm = gr.Dropdown(
318
+ label="Global BPM",
319
+ choices=[90, 100, 110, 120, 130, 140, 150],
320
+ value=120,
321
+ info="BPM applied to both drum and instrument generation"
322
+ )
323
+
324
+ global_bars = gr.Dropdown(
325
+ label="Loop Length (Bars)",
326
+ choices=[1, 2, 4, 8],
327
+ value=4,
328
+ info="Number of bars for each loop"
329
+ )
330
+
331
+ base_prompt = gr.Textbox(
332
+ label="Base Prompt",
333
+ value="techno",
334
+ placeholder="e.g., 'techno', 'jazz', 'ambient', 'hip-hop'",
335
+ info="Style applied to both loops"
336
+ )
337
+
338
+ # Auto-suggest optimal bars based on BPM
339
+ def update_suggested_bars(bpm):
340
+ optimal = calculate_optimal_bars(bpm)
341
+ return gr.update(info=f"Suggested: {optimal} bars for {bpm}bpm (≀10s)")
342
+
343
+ global_bpm.change(update_suggested_bars, inputs=[global_bpm], outputs=[global_bars])
344
+
345
+ # ========== LOOP GENERATION ==========
346
+ gr.Markdown("## πŸ₯ Step 1: Generate Individual Loops")
347
+
348
  with gr.Row():
349
  with gr.Column():
350
+ gr.Markdown("### πŸ₯ Drum Loop")
351
+ generate_drums_btn = gr.Button("Generate Drums", variant="primary", size="lg")
352
+ drums_audio = gr.Audio(label="Drum Loop", type="filepath")
353
+ drums_status = gr.Textbox(label="Drums Status", value="Ready to generate")
354
+
 
 
355
  with gr.Column():
356
+ gr.Markdown("### 🎹 Instrument Loop")
357
+ generate_instruments_btn = gr.Button("Generate Instruments", variant="secondary", size="lg")
358
+ instruments_audio = gr.Audio(label="Instrument Loop", type="filepath")
359
+ instruments_status = gr.Textbox(label="Instruments Status", value="Ready to generate")
360
+
361
+ # Seed controls
362
+ with gr.Row():
363
+ drums_seed = gr.Number(label="Drums Seed", value=-1, info="-1 for random")
364
+ instruments_seed = gr.Number(label="Instruments Seed", value=-1, info="-1 for random")
365
+
366
+ # ========== COMBINATION ==========
367
+ gr.Markdown("## πŸŽ›οΈ Step 2: Combine Loops")
368
+
369
+ with gr.Row():
370
+ num_repeats = gr.Slider(
371
+ label="Number of Repetitions",
372
+ minimum=1,
373
+ maximum=5,
374
+ step=1,
375
+ value=2,
376
+ info="How many times to repeat each loop (creates longer audio)"
377
+ )
378
+ combine_btn = gr.Button("πŸŽ›οΈ Combine Loops", variant="primary", size="lg")
379
+
380
+ combined_audio = gr.Audio(label="Combined Loops", type="filepath")
381
+ combine_status = gr.Textbox(label="Combine Status", value="Generate loops first")
382
+
383
+ # ========== MELODYFLOW TRANSFORMATION ==========
384
+ gr.Markdown("## 🎨 Step 3: Transform with MelodyFlow")
385
+
386
+ with gr.Row():
387
+ with gr.Column():
388
+ transform_prompt = gr.Textbox(
389
+ label="Transformation Prompt",
390
+ value="aggressive industrial techno with distorted sounds",
391
+ placeholder="Describe the style transformation",
392
+ lines=2
393
+ )
394
+
395
+ with gr.Column():
396
+ transform_solver = gr.Dropdown(
397
+ label="Solver",
398
+ choices=["euler", "midpoint"],
399
+ value="euler",
400
+ info="EULER: faster (25 steps), MIDPOINT: slower (64 steps)"
401
+ )
402
+ transform_flowstep = gr.Slider(
403
+ label="Transform Intensity",
404
+ minimum=0.0,
405
+ maximum=0.15,
406
+ step=0.01,
407
+ value=0.12,
408
+ info="Lower = more dramatic transformation"
409
+ )
410
+
411
+ transform_btn = gr.Button("🎨 Transform Audio", variant="secondary", size="lg")
412
+ transformed_audio = gr.Audio(label="Transformed Audio", type="filepath")
413
+ transform_status = gr.Textbox(label="Transform Status", value="Combine audio first")
414
+
415
+ # ========== EVENT HANDLERS ==========
416
+
417
+ # Generate drums
418
+ generate_drums_btn.click(
419
+ generate_stable_audio_loop,
420
+ inputs=[base_prompt, gr.State("drums"), global_bpm, global_bars, drums_seed],
421
+ outputs=[drums_audio, drums_status]
422
+ )
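+ # gr.State("drums") pins the loop_type argument so the same generate_stable_audio_loop function serves both buttons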
423
+
424
+ # Generate instruments
425
+ generate_instruments_btn.click(
426
+ generate_stable_audio_loop,
427
+ inputs=[base_prompt, gr.State("instruments"), global_bpm, global_bars, instruments_seed],
428
+ outputs=[instruments_audio, instruments_status]
429
+ )
430
+
431
+ # Combine loops
432
+ combine_btn.click(
433
+ combine_loops,
434
+ inputs=[drums_audio, instruments_audio, global_bpm, global_bars, num_repeats],
435
+ outputs=[combined_audio, combine_status]
436
+ )
437
+
438
+ # Transform with MelodyFlow
439
+ transform_btn.click(
440
+ transform_with_melodyflow_api,
441
+ inputs=[combined_audio, transform_prompt, transform_solver, transform_flowstep],
442
+ outputs=[transformed_audio, transform_status]
443
+ )
444
+
445
+ # ========== EXAMPLES ==========
446
+ gr.Markdown("## 🎯 Example Workflows")
447
+
448
+ examples = gr.Examples(
449
+ examples=[
450
+ ["techno", 128, 4, "aggressive industrial techno"],
451
+ ["jazz", 110, 2, "smooth lo-fi jazz with vinyl crackle"],
452
+ ["ambient", 90, 8, "ethereal ambient soundscape"],
453
+ ["hip-hop", 100, 4, "classic boom bap hip-hop"],
454
+ ["drum and bass", 140, 4, "liquid drum and bass"],
455
+ ],
456
+ inputs=[base_prompt, global_bpm, global_bars, transform_prompt],
457
+ )
458
 
459
+ if __name__ == "__main__":
460
+ iface.launch()
requirements.txt CHANGED
@@ -1,7 +1,22 @@
- torch==2.1.0
- torchaudio
- audiocraft
- musiclang_predict
- pyFluidSynth
- midi2audio
- spaces
+ # Core ML dependencies
+ torch>=2.5.0
+ torchaudio>=2.5.0
+ einops
+ transformers
+ accelerate
+ diffusers
+ scipy
+ librosa
+ soundfile
+ huggingface-hub
+
+ # Gradio and HF Spaces
+ gradio
+ gradio-client
+ spaces
+
+ # Additional utilities
+ numpy
+
+ # Install stable-audio-tools from local submodule
+ -e ./stable-audio-tools
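+ # (editable install: the app imports stable_audio_tools from the submodule checked out in this repo)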
stable-audio-tools ADDED
@@ -0,0 +1 @@
+ Subproject commit 9e5954dd60718373c90445ede390b02aa7119665