Commit · c684cf6
Parent(s): 8ecf4fa
first attempt here
Files changed:
- .gitmodules (+3 -3)
- app.py (+435 -319)
- requirements.txt (+22 -7)
- stable-audio-tools (+1 -0)
.gitmodules
CHANGED
@@ -1,3 +1,3 @@
-[submodule "
-	path =
-	url = https://github.com/
+[submodule "stable-audio-tools"]
+	path = stable-audio-tools
+	url = https://github.com/Stability-AI/stable-audio-tools.git
app.py
CHANGED
@@ -1,344 +1,460 @@
 import gradio as gr
 import spaces
-from musiclang_predict import MusicLangPredictor
-import random
-import subprocess
-import os
-import torchaudio
 import torch
-import
-from audiocraft.models import MusicGen
-from audiocraft.data.audio import audio_write
-from pydub import AudioSegment
-
-import tempfile
-from pydub import AudioSegment
 import io
-
-    return target_peak * (y / np.max(np.abs(y)))
-
-def rms_normalize(y, target_rms=0.05):
-    return y * (target_rms / np.sqrt(np.mean(y**2)))
-
-def preprocess_audio(waveform):
-    waveform_np = waveform.cpu().squeeze().numpy()  # Move to CPU before converting to NumPy
-    # processed_waveform_np = rms_normalize(peak_normalize(waveform_np))
-    return torch.from_numpy(waveform_np).unsqueeze(0).to(device)
-
-def create_slices(song, sr, slice_duration, bpm, num_slices=5):
-    song_length = song.shape[-1] / sr
-    slices = []
-
-    # Ensure the first slice is from the beginning of the song
-    first_slice_waveform = song[..., :int(slice_duration * sr)]
-    slices.append(first_slice_waveform)
-
-    for i in range(1, num_slices):
-        possible_start_indices = list(range(int(slice_duration * sr), int(song_length * sr), int(4 * 60 / bpm * sr)))
-        if not possible_start_indices:
-            # If there are no valid start indices, duplicate the first slice
-            slices.append(first_slice_waveform)
-            continue
-        else:
-
-def generate_midi(seed, use_chords, chord_progression, bpm):
-    if seed == "":
-        seed = random.randint(1, 10000)
-
-    ml = MusicLangPredictor('musiclang/musiclang-v2')

     try:
-        seed = random.randint(1, 10000)
-
-        nb_tokens = 1024
-        temperature = 0.9
-        top_p = 1.0
-
-        if use_chords and chord_progression.strip():
-            score = ml.predict_chords(
-                chord_progression,
-                time_signature=(4, 4),
-                temperature=temperature,
-                topp=top_p,
-                rng_seed=seed
-            )
-        else:
-            score = ml.predict(
-                nb_tokens=nb_tokens,
-                temperature=temperature,
-                topp=top_p,
-                rng_seed=seed
-            )
-
-        midi_filename = f"output_{seed}.mid"
-        wav_filename = midi_filename.replace(".mid", ".wav")
-
-        score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))
-
-        subprocess.run(["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"])
-
-        # Clean up temporary MIDI file
-        os.remove(midi_filename)
-
-        sample_rate = 44100  # Assuming fixed sample rate from fluidsynth command
-        return wav_filename
-
-@spaces.GPU(duration=120)
-def generate_music(wav_filename, prompt_duration, musicgen_model, num_iterations, bpm):
-    # Load the audio from the passed file path
-    song, sr = torchaudio.load(wav_filename)
-    song = song.to(device)
-    # Use the user-provided BPM value for duration calculation
-    duration = calculate_duration(bpm)
-
-    # Create slices from the song using the user-provided BPM value
-    slices = create_slices(song, sr, 35, bpm, num_slices=5)
-
-    # Load the model
-    model_name = musicgen_model.split(" ")[0]
-    model_continue = MusicGen.get_pretrained(model_name)
-
-    # Setting generation parameters
-    model_continue.set_generation_params(
-        use_sampling=True,
-        top_k=250,
-        top_p=0.0,
-        temperature=1.0,
-        duration=duration,
-        cfg_coef=3
-    )
-
-    all_audio_files = []
-
-    for i in range(num_iterations):
-        slice_idx = i % len(slices)
-
-        os.remove(file_path)
-
-    return combined_audio_filename
-
-
-# Define the expandable sections
-musiclang_blurb = """
-## musiclang
-musiclang is a controllable ai midi model. it can generate midi sequences based on user-provided parameters, or unconditionally.
-[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> musiclang github](https://github.com/MusicLang/musiclang_predict)
-[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)
-"""
-
-musicgen_blurb = """
-## musicgen
-musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
-[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)
-visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
-see also https://youtube.com/@thecollabagepatch
-"""
-
-finetunes_blurb = """
-## fine-tuned models
-the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
-[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)
-[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
-"""
-
-# Define the fine-tunes blurb for each model
-fine_tunes_info = """
-## thepatch/vanya_ai_dnb_0.1
-thepatch/vanya_ai_dnb_0.1 was trained by vanya. [vanya's Twitter](https://twitter.com/@veryVANYA) - it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well)
-
-## thepatch/bleeps-medium
-thepatch/bleeps-medium was trained by kevin and lyra [lyra's Twitter](https://twitter.com/@_lyraaaa_) - it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. It can be amazing at turning your chords into pads, and is a good percussionist.
-
-## thepatch/budots_remix
-thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic, kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.
-
-## thepatch/hoenn_lofi
-thepatch/hoenn_lofi is a large fine-tune by hoenn. [hoenn's Twitter](https://twitter.com/@eschatolocation) - this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.
-
-## thepatch/PhonkV2
-thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.
-
-## foureyednymph/musicgen-sza-sos-small
-foureyednymph/musicgen-sza-sos-small was just trained by foureyednymph. We're all about to find out if it does continuations well.
-"""
-
-# Create the Gradio interface
-with gr.Blocks() as iface:
-    gr.Markdown("# the-slot-machine")
-    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
-    gr.Markdown("this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model. trim it so that you like the beginning of the output, and choose the prompt duration. Then we give it to musicgen to continue for 30 seconds. We can then choose a new model and prompt duration, trim it, and give it to musicgen to continue from the end of the output. Re-upload, trim again and repeat with a new musicgen model and different prompt duration if you want. ")
-
-    with gr.Accordion("more info", open=False):
-        gr.Markdown(musiclang_blurb)
-        gr.Markdown(musicgen_blurb)
-        gr.Markdown(finetunes_blurb)

-        gr.Markdown(fine_tunes_info)

     with gr.Row():
         with gr.Column():
-            midi_audio = gr.Audio(label="Generated MIDI Audio", type="filepath")  # Ensure this is set to handle file paths
-
         with gr.Column():
 import gradio as gr
 import spaces
 import torch
+import torchaudio
 import io
+import base64
+import uuid
+import os
+import time
+import re
+import threading
+import gc
+import random
+import numpy as np
+from einops import rearrange
+from huggingface_hub import login
+from stable_audio_tools import get_pretrained_model
+from stable_audio_tools.inference.generation import generate_diffusion_cond
+from gradio_client import Client, handle_file
+from contextlib import contextmanager
+
+# Global model storage
+model_cache = {}
+model_lock = threading.Lock()
+
+@contextmanager
+def resource_cleanup():
+    """Context manager to ensure proper cleanup of GPU resources."""
+    try:
+        yield
+    finally:
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+            torch.cuda.empty_cache()
+        gc.collect()
+
+def load_stable_audio_model():
+    """Load stable-audio-open-small model if not already loaded."""
+    with model_lock:
+        if 'stable_audio_model' not in model_cache:
+            print("Loading stable-audio-open-small model...")
+
+            # Authenticate with HF
+            hf_token = os.getenv('HF_TOKEN')
+            if hf_token:
+                login(token=hf_token)
+                print(f"✅ HF authenticated")
+
+            # Load model
+            model, config = get_pretrained_model("stabilityai/stable-audio-open-small")
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            model = model.to(device)
+            if device == "cuda":
+                model = model.half()
+
+            model_cache['stable_audio_model'] = model
+            model_cache['stable_audio_config'] = config
+            model_cache['stable_audio_device'] = device
+            print(f"✅ Stable Audio model loaded on {device}")
+
+        return (model_cache['stable_audio_model'],
+                model_cache['stable_audio_config'],
+                model_cache['stable_audio_device'])

+@spaces.GPU
+def generate_stable_audio_loop(prompt, loop_type, bpm, bars, seed=-1):
+    """Generate a BPM-aware loop using stable-audio-open-small"""
+    try:
+        model, config, device = load_stable_audio_model()

+        # Calculate loop duration based on BPM and bars
+        seconds_per_beat = 60.0 / bpm
+        seconds_per_bar = seconds_per_beat * 4  # 4/4 time
+        target_loop_duration = seconds_per_bar * bars

+        # Enhance prompt based on loop type and BPM
+        if loop_type == "drums":
+            enhanced_prompt = f"{prompt} drum loop {bpm}bpm"
+            negative_prompt = "melody, harmony, pitched instruments, vocals, singing"
+        else:  # instruments
+            enhanced_prompt = f"{prompt} instrumental loop {bpm}bpm"
+            negative_prompt = "drums, percussion, kick, snare, hi-hat"

+        # Set seed
+        if seed == -1:
+            seed = random.randint(0, 2**32 - 1)

+        torch.manual_seed(seed)
+        if device == "cuda":
+            torch.cuda.manual_seed(seed)

+        print(f"🎵 Generating {loop_type} loop:")
+        print(f"   Enhanced prompt: {enhanced_prompt}")
+        print(f"   Target duration: {target_loop_duration:.2f}s ({bars} bars at {bpm}bpm)")
+        print(f"   Seed: {seed}")
+
+        # Prepare conditioning
+        conditioning = [{
+            "prompt": enhanced_prompt,
+            "seconds_total": 12  # Model generates 12s max
+        }]
+
+        negative_conditioning = [{
+            "prompt": negative_prompt,
+            "seconds_total": 12
+        }]
+
+        start_time = time.time()
+
+        with resource_cleanup():
+            if device == "cuda":
+                torch.cuda.empty_cache()
+
+            with torch.cuda.amp.autocast(enabled=(device == "cuda")):
+                output = generate_diffusion_cond(
+                    model,
+                    steps=8,  # Fast generation
+                    cfg_scale=1.0,  # Good balance for loops
+                    conditioning=conditioning,
+                    negative_conditioning=negative_conditioning,
+                    sample_size=config["sample_size"],
+                    sampler_type="pingpong",
+                    device=device,
+                    seed=seed
+                )
+
+        generation_time = time.time() - start_time
+
+        # Post-process audio
+        output = rearrange(output, "b d n -> d (b n)")  # (2, N) stereo
+        output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1)
+
+        # Extract the loop portion
+        sample_rate = config["sample_rate"]
+        loop_samples = int(target_loop_duration * sample_rate)
+        available_samples = output.shape[1]
+
+        if loop_samples > available_samples:
+            loop_samples = available_samples
+            actual_duration = available_samples / sample_rate
+            print(f"⚠️ Requested {target_loop_duration:.2f}s, got {actual_duration:.2f}s")
+
+        # Extract loop from beginning (cleanest beat alignment)
+        loop_output = output[:, :loop_samples]
+        loop_output_int16 = loop_output.mul(32767).to(torch.int16).cpu()
+
+        # Save to temporary file
+        loop_filename = f"loop_{loop_type}_{bpm}bpm_{bars}bars_{seed}.wav"
+        torchaudio.save(loop_filename, loop_output_int16, sample_rate)
+
+        actual_duration = loop_samples / sample_rate
+        print(f"✅ {loop_type.title()} loop generated: {actual_duration:.2f}s in {generation_time:.2f}s")
+
+        return loop_filename, f"Generated {actual_duration:.2f}s {loop_type} loop at {bpm}bpm ({bars} bars)"

+    except Exception as e:
+        print(f"❌ Generation error: {str(e)}")
+        return None, f"Error: {str(e)}"

+def combine_loops(drums_audio, instruments_audio, bpm, bars, num_repeats):
+    """Combine drum and instrument loops with specified repetitions"""
     try:
+        if not drums_audio and not instruments_audio:
+            return None, "No audio files to combine"

+        # Calculate timing
+        seconds_per_beat = 60.0 / bpm
+        seconds_per_bar = seconds_per_beat * 4
+        loop_duration = seconds_per_bar * bars
+        total_duration = loop_duration * num_repeats

+        print(f"🎛️ Combining loops:")
+        print(f"   Loop duration: {loop_duration:.2f}s ({bars} bars)")
+        print(f"   Repeats: {num_repeats}")
+        print(f"   Total duration: {total_duration:.2f}s")

+        combined_audio = None
+        sample_rate = None

+        # Process each audio file
+        for audio_path, audio_type in [(drums_audio, "drums"), (instruments_audio, "instruments")]:
+            if audio_path:
+                # Load audio
+                waveform, sr = torchaudio.load(audio_path)
+                if sample_rate is None:
+                    sample_rate = sr
+
+                # Ensure we have the exact loop duration
+                target_samples = int(loop_duration * sr)
+                if waveform.shape[1] > target_samples:
+                    waveform = waveform[:, :target_samples]
+                elif waveform.shape[1] < target_samples:
+                    # Pad if necessary
+                    padding = target_samples - waveform.shape[1]
+                    waveform = torch.cat([waveform, torch.zeros(waveform.shape[0], padding)], dim=1)
+
+                # Repeat the loop
+                repeated_waveform = waveform.repeat(1, num_repeats)
+
+                print(f"   {audio_type}: {waveform.shape[1]/sr:.2f}s repeated {num_repeats}x = {repeated_waveform.shape[1]/sr:.2f}s")
+
+                # Add to combined audio
+                if combined_audio is None:
+                    combined_audio = repeated_waveform
+                else:
+                    combined_audio = combined_audio + repeated_waveform

+        if combined_audio is None:
+            return None, "No valid audio to combine"

+        # Normalize to prevent clipping
+        combined_audio = combined_audio / torch.max(torch.abs(combined_audio))
+        combined_audio = combined_audio.clamp(-1, 1)
+
+        # Convert to int16 and save
+        combined_audio_int16 = combined_audio.mul(32767).to(torch.int16)
+        combined_filename = f"combined_{bpm}bpm_{bars}bars_{num_repeats}loops_{random.randint(1000, 9999)}.wav"
+        torchaudio.save(combined_filename, combined_audio_int16, sample_rate)
+
+        actual_duration = combined_audio.shape[1] / sample_rate
+        status = f"Combined into {actual_duration:.2f}s audio ({num_repeats} × {bars} bars at {bpm}bpm)"
+
+        print(f"✅ {status}")
+        return combined_filename, status
+
+    except Exception as e:
+        print(f"❌ Combine error: {str(e)}")
+        return None, f"Combine error: {str(e)}"
+
+def transform_with_melodyflow_api(audio_path, prompt, solver="euler", flowstep=0.12):
+    """Transform audio using Facebook/MelodyFlow space API"""
+    if audio_path is None:
+        return None, "❌ No audio file provided"
+
+    try:
+        # Initialize client for Facebook MelodyFlow space
+        client = Client("facebook/MelodyFlow")
+
+        # Set steps based on solver
+        if solver == "midpoint":
+            base_steps = 128
+            effective_steps = base_steps // 2  # 64 effective steps
+        else:  # euler
+            base_steps = 125
+            effective_steps = base_steps // 5  # 25 effective steps
+
+        print(f"🎛️ MelodyFlow transformation:")
+        print(f"   Prompt: {prompt}")
+        print(f"   Solver: {solver} ({effective_steps} effective steps)")
+        print(f"   Flowstep: {flowstep}")
+
+        # Call the MelodyFlow API
+        result = client.predict(
+            model="facebook/melodyflow-t24-30secs",
+            text=prompt,
+            solver=solver,
+            steps=base_steps,
+            target_flowstep=flowstep,
+            regularize=solver == "euler",
+            regularization_strength=0.2,
+            duration=30,
+            melody=handle_file(audio_path),
+            api_name="/predict"
+        )
+
+        if result and len(result) > 0 and result[0]:
+            # Save the result locally
+            output_filename = f"melodyflow_transformed_{random.randint(1000, 9999)}.wav"
+            import shutil
+            shutil.copy2(result[0], output_filename)
+
+            status_msg = f"✅ Transformed with prompt: '{prompt}' (flowstep: {flowstep}, {effective_steps} steps)"
+            return output_filename, status_msg
+        else:
+            return None, "❌ MelodyFlow API returned no results"
+
+    except Exception as e:
+        return None, f"❌ MelodyFlow API error: {str(e)}"
+
+def calculate_optimal_bars(bpm):
+    """Calculate optimal bar count for given BPM to fit in ~10s"""
+    seconds_per_beat = 60.0 / bpm
+    seconds_per_bar = seconds_per_beat * 4
+    max_duration = 10.0
+
+    for bars in [8, 4, 2, 1]:
+        if seconds_per_bar * bars <= max_duration:
+            return bars
+    return 1

+# ========== GRADIO INTERFACE ==========

+with gr.Blocks(title="🎵 Stable Audio Loop Generator") as iface:
+    gr.Markdown("# 🎵 Stable Audio Loop Generator")
+    gr.Markdown("**Generate synchronized drum and instrument loops with stable-audio-open-small, then transform with MelodyFlow!**")
+
+    with gr.Accordion("How This Works", open=False):
+        gr.Markdown("""
+        **Workflow:**
+        1. **Set global BPM and bars** - affects both drum and instrument generation
+        2. **Generate drum loop** - creates BPM-aware percussion
+        3. **Generate instrument loop** - creates melodic/harmonic content
+        4. **Combine loops** - layer them together with repetitions (up to 30s)
+        5. **Transform** - use MelodyFlow to stylistically transform the combined result
+
+        **Features:**
+        - BPM-aware generation ensures perfect sync between loops
+        - Negative prompting separates drums from instruments cleanly
+        - Smart bar calculation optimizes loop length for the BPM
+        - MelodyFlow integration for advanced style transfer
+        """)
+
+    # ========== GLOBAL CONTROLS ==========
+    gr.Markdown("## 🎛️ Global Settings")
+
+    with gr.Row():
+        global_bpm = gr.Dropdown(
+            label="Global BPM",
+            choices=[90, 100, 110, 120, 130, 140, 150],
+            value=120,
+            info="BPM applied to both drum and instrument generation"
+        )
+
+        global_bars = gr.Dropdown(
+            label="Loop Length (Bars)",
+            choices=[1, 2, 4, 8],
+            value=4,
+            info="Number of bars for each loop"
+        )
+
+        base_prompt = gr.Textbox(
+            label="Base Prompt",
+            value="techno",
+            placeholder="e.g., 'techno', 'jazz', 'ambient', 'hip-hop'",
+            info="Style applied to both loops"
+        )
+
+    # Auto-suggest optimal bars based on BPM
+    def update_suggested_bars(bpm):
+        optimal = calculate_optimal_bars(bpm)
+        return gr.update(info=f"Suggested: {optimal} bars for {bpm}bpm (≤10s)")
+
+    global_bpm.change(update_suggested_bars, inputs=[global_bpm], outputs=[global_bars])
+
+    # ========== LOOP GENERATION ==========
+    gr.Markdown("## 🥁 Step 1: Generate Individual Loops")
+
     with gr.Row():
         with gr.Column():
+            gr.Markdown("### 🥁 Drum Loop")
+            generate_drums_btn = gr.Button("Generate Drums", variant="primary", size="lg")
+            drums_audio = gr.Audio(label="Drum Loop", type="filepath")
+            drums_status = gr.Textbox(label="Drums Status", value="Ready to generate")
+
         with gr.Column():
+            gr.Markdown("### 🎹 Instrument Loop")
+            generate_instruments_btn = gr.Button("Generate Instruments", variant="secondary", size="lg")
+            instruments_audio = gr.Audio(label="Instrument Loop", type="filepath")
+            instruments_status = gr.Textbox(label="Instruments Status", value="Ready to generate")
+
+    # Seed controls
+    with gr.Row():
+        drums_seed = gr.Number(label="Drums Seed", value=-1, info="-1 for random")
+        instruments_seed = gr.Number(label="Instruments Seed", value=-1, info="-1 for random")
+
+    # ========== COMBINATION ==========
+    gr.Markdown("## 🎛️ Step 2: Combine Loops")
+
+    with gr.Row():
+        num_repeats = gr.Slider(
+            label="Number of Repetitions",
+            minimum=1,
+            maximum=5,
+            step=1,
+            value=2,
+            info="How many times to repeat each loop (creates longer audio)"
+        )
+        combine_btn = gr.Button("🎛️ Combine Loops", variant="primary", size="lg")
+
+    combined_audio = gr.Audio(label="Combined Loops", type="filepath")
+    combine_status = gr.Textbox(label="Combine Status", value="Generate loops first")
+
+    # ========== MELODYFLOW TRANSFORMATION ==========
+    gr.Markdown("## 🎨 Step 3: Transform with MelodyFlow")
+
+    with gr.Row():
+        with gr.Column():
+            transform_prompt = gr.Textbox(
+                label="Transformation Prompt",
+                value="aggressive industrial techno with distorted sounds",
+                placeholder="Describe the style transformation",
+                lines=2
+            )
+
+        with gr.Column():
+            transform_solver = gr.Dropdown(
+                label="Solver",
+                choices=["euler", "midpoint"],
+                value="euler",
+                info="EULER: faster (25 steps), MIDPOINT: slower (64 steps)"
+            )
+            transform_flowstep = gr.Slider(
+                label="Transform Intensity",
+                minimum=0.0,
+                maximum=0.15,
+                step=0.01,
+                value=0.12,
+                info="Lower = more dramatic transformation"
+            )
+
+    transform_btn = gr.Button("🎨 Transform Audio", variant="secondary", size="lg")
+    transformed_audio = gr.Audio(label="Transformed Audio", type="filepath")
+    transform_status = gr.Textbox(label="Transform Status", value="Combine audio first")
+
+    # ========== EVENT HANDLERS ==========
+
+    # Generate drums
+    generate_drums_btn.click(
+        generate_stable_audio_loop,
+        inputs=[base_prompt, gr.State("drums"), global_bpm, global_bars, drums_seed],
+        outputs=[drums_audio, drums_status]
+    )
+
+    # Generate instruments
+    generate_instruments_btn.click(
+        generate_stable_audio_loop,
+        inputs=[base_prompt, gr.State("instruments"), global_bpm, global_bars, instruments_seed],
+        outputs=[instruments_audio, instruments_status]
+    )
+
+    # Combine loops
+    combine_btn.click(
+        combine_loops,
+        inputs=[drums_audio, instruments_audio, global_bpm, global_bars, num_repeats],
+        outputs=[combined_audio, combine_status]
+    )
+
+    # Transform with MelodyFlow
+    transform_btn.click(
+        transform_with_melodyflow_api,
+        inputs=[combined_audio, transform_prompt, transform_solver, transform_flowstep],
+        outputs=[transformed_audio, transform_status]
+    )
+
+    # ========== EXAMPLES ==========
+    gr.Markdown("## 🎯 Example Workflows")
+
+    examples = gr.Examples(
+        examples=[
+            ["techno", 128, 4, "aggressive industrial techno"],
+            ["jazz", 110, 2, "smooth lo-fi jazz with vinyl crackle"],
+            ["ambient", 90, 8, "ethereal ambient soundscape"],
+            ["hip-hop", 100, 4, "classic boom bap hip-hop"],
+            ["drum and bass", 140, 4, "liquid drum and bass"],
+        ],
+        inputs=[base_prompt, global_bpm, global_bars, transform_prompt],
+    )

+if __name__ == "__main__":
+    iface.launch()
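For orientation, here is a minimal, hypothetical sketch (not part of the commit) of how the functions introduced above could be exercised without the Gradio UI, assuming app.py is importable and the Space's environment (GPU access and HF_TOKEN) is in place. The timing follows the same arithmetic as the code: at 120 bpm one 4/4 bar lasts 60/120 × 4 = 2 s, so a 4-bar loop is 8 s and two repeats of the combined loop give roughly 16 s.

# Hypothetical smoke test for the new pipeline (illustration only, not in the commit).
from app import generate_stable_audio_loop, combine_loops, transform_with_melodyflow_api

bpm, bars = 120, 4  # 4 bars at 120 bpm -> (60 / 120) * 4 * 4 = 8.0 s per loop

# Generate a drum loop and an instrument loop that share BPM, bar count and style.
drums_path, drums_msg = generate_stable_audio_loop("techno", "drums", bpm, bars, seed=42)
inst_path, inst_msg = generate_stable_audio_loop("techno", "instruments", bpm, bars, seed=43)

# Layer the two loops and repeat them twice (~16 s of audio).
combined_path, combine_msg = combine_loops(drums_path, inst_path, bpm, bars, num_repeats=2)
print(combine_msg)

# Send the result to the facebook/MelodyFlow Space for a style transformation.
styled_path, style_msg = transform_with_melodyflow_api(
    combined_path, "aggressive industrial techno", solver="euler", flowstep=0.12
)
print(style_msg)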
requirements.txt
CHANGED
@@ -1,7 +1,22 @@
+# Core ML dependencies
+torch>=2.5.0
+torchaudio>=2.5.0
+einops
+transformers
+accelerate
+diffusers
+scipy
+librosa
+soundfile
+huggingface-hub
+
+# Gradio and HF Spaces
+gradio
+gradio-client
+spaces
+
+# Additional utilities
+numpy
+
+# Install stable-audio-tools from local submodule
+-e ./stable-audio-tools
stable-audio-tools
ADDED
@@ -0,0 +1 @@
+Subproject commit 9e5954dd60718373c90445ede390b02aa7119665
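Since requirements.txt installs stable-audio-tools as an editable package from the local submodule (-e ./stable-audio-tools), the submodule has to be checked out (git submodule update --init) before the pip install runs. A small, hypothetical sanity check that the editable install and the gated model download both work (HF_TOKEN assumed to be set, as in app.py):

# Hypothetical post-install check (illustration only, not in the commit).
import os
from huggingface_hub import login
from stable_audio_tools import get_pretrained_model

if os.getenv("HF_TOKEN"):
    login(token=os.getenv("HF_TOKEN"))  # stable-audio-open-small is a gated model

model, config = get_pretrained_model("stabilityai/stable-audio-open-small")
print(config["sample_rate"], config["sample_size"])  # the values app.py uses to size and slice loops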