import gradio as gr
import spaces
import torch
import torchaudio
import io
import base64
import uuid
import os
import time
import re
import threading
import gc
import random
import numpy as np
from einops import rearrange
from huggingface_hub import login
from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond
from gradio_client import Client, handle_file
from contextlib import contextmanager

# MelodyFlow Variations - extracted from variations.py
MELODYFLOW_VARIATIONS = {
    # Acoustic Instruments
    'accordion_folk': "Lively accordion music with a European folk feeling, perfect for a travel documentary about traditional culture and street performances in Paris",
    'banjo_bluegrass': "Authentic bluegrass banjo band performance with rich picking patterns, ideal for a heartfelt documentary about American rural life and traditional crafts",
    'piano_classical': "Expressive classical piano performance with dynamic range and emotional depth, ideal for a luxury brand commercial",
    'celtic': "Traditional Celtic arrangement with fiddle and flute, perfect for a documentary about Ireland's stunning landscapes and ancient traditions",
    'strings_quartet': "Elegant string quartet arrangement with rich harmonies and expressive dynamics, perfect for wedding ceremony music",

    # Synthesizer Variations
    'synth_retro': "1980s style synthesizer melody with warm analog pads and arpeggios, perfect for a nostalgic sci-fi movie soundtrack",
    'synth_modern': "Modern electronic production with crisp digital synthesizer arpeggios and vocoder effects, ideal for a tech product launch video",
    'synth_ambient': "Atmospheric synthesizer pads with reverb and delay, perfect for a meditation app or wellness commercial",
    'synth_edm': "High-energy EDM synth saw leads with sidechain compression, pitch bends, perfect for sports highlights or action sequences",

    # Band Arrangements
    'rock_band': "Full rock band arrangement with electric guitars, bass, and drums, perfect for an action movie trailer",

    # Hybrid/Special
    'cinematic_epic': "Epic orchestral arrangement with modern hybrid elements, synthesizers, and percussion, perfect for movie trailers",
    'lofi_chill': "Lo-fi hip hop style with vinyl crackle, mellow piano, and tape saturation, perfect for study or focus playlists",
    'synth_bass': "Deep analog synthesizer bassline with modern production and subtle modulation, perfect for electronic music production",
    'retro_rpg': "16-bit era JRPG soundtrack with bright melodic synthesizers, orchestral elements, and adventurous themes, perfect for a fantasy video game battle scene or overworld exploration",
    'steel_drums': "Vibrant Caribbean steel drum ensemble with tropical percussion and uplifting melodies, perfect for a beach resort commercial or travel documentary",
    'chiptune': "8-bit video game soundtrack with arpeggiated melodies and classic NES-style square waves, perfect for a retro platformer or action game",
    'gamelan_fusion': "Indonesian gamelan ensemble with metallic percussion, gongs, and ethereal textures, perfect for a meditation app or spiritual documentary",
    'music_box': "Delicate music box melody with gentle bell tones and ethereal ambiance, perfect for a children's lullaby or magical fantasy scene",

    # Hip Hop / Trap Percussion
    'trap_808': "808 bass",
    'lo_fi_drums': "lofi hiphop percussion",
    'boom_bap': "Classic 90s boom bap hip hop drums with punchy kicks, crisp snares, and jazz sample chops, perfect for documentary footage of urban street scenes and skateboarding",
    'percussion_ensemble': "Rich percussive ensemble with djembe, congas, shakers, and tribal drums creating complex polyrhythms, perfect for nature documentaries about rainforests or ancient cultural rituals",

    # Enhanced Electronic Music
    'future_bass': "Energetic future bass with filtered supersaws, pitch-bending lead synths, heavy sidechain, and chopped vocal samples, perfect for extreme sports highlights or uplifting motivational content",
    'synthwave_retro': "80s retrofuturistic synthwave with gated reverb drums, analog arpeggios, neon-bright lead synths and driving bass, perfect for cyberpunk-themed technology showcases or retro gaming montages",
    'melodic_techno': "Hypnotic melodic techno with pulsing bass, atmospheric pads, and evolving synthesizer sequences with subtle filter modulation, ideal for timelapse footage of urban nightscapes or architectural showcases",
    'dubstep_wobble': "Heavy dubstep with aggressive wobble bass, metallic synthesizers, distorted drops, and tension-building risers, perfect for action sequence transitions or gaming highlight reels",

    # Glitchy Effects
    'glitch_hop': "Glitch hop with stuttering sample slices, bit-crushed percussion, granular synthesis textures and digital artifacts, perfect for technology malfunction scenes or data visualization animations",
    'digital_disruption': "Heavily glitched soundscape with digital artifacts, buffer errors, granular time stretching, and corrupted audio samples, ideal for cybersecurity themes or digital distortion transitions in tech presentations",
    'circuit_bent': "Circuit-bent toy sounds with unpredictable pitch shifts, broken electronic tones, and hardware malfunction artifacts, perfect for creative coding demonstrations or innovative technology exhibitions",

    # Experimental Hybrids
    'orchestral_glitch': "Cinematic orchestral elements disrupted by digital glitches, granular textures, and temporal distortions, perfect for science fiction trailers or futuristic product reveals with contrasting classical and modern elements",
    'vapor_drums': "Vaporwave drum processing with extreme pitch and time manipulation, reverb-drenched samples, and retro commercial music elements, ideal for nostalgic internet culture documentaries or retrofuturistic art installations",
    'industrial_textures': "Harsh industrial soundscape with mechanical percussion, factory recordings, metallic impacts, and distorted synth drones, perfect for manufacturing process videos or dystopian urban environments",
    'jungle_breaks': "High-energy jungle drum breaks with choppy breakbeat samples, deep sub bass, and dub reggae influences, perfect for fast-paced urban chase scenes or extreme sports montages"
}

# Global model storage
model_cache = {}
model_lock = threading.Lock()

@contextmanager  # so this can be used in a `with` block (contextmanager is imported above)
def resource_cleanup():
    """Lightweight context manager - let zerogpu handle memory management."""
    try:
        yield
    finally:
        # Minimal cleanup - let zerogpu handle the heavy lifting
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        # Removed aggressive empty_cache() and gc.collect() calls
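
# Usage sketch (illustrative only; resource_cleanup() is defined but not wired into
# the handlers below): wrap a generation call so pending CUDA work is synchronized
# before the function returns, e.g.
#
#     with resource_cleanup():
#         output = generate_diffusion_cond(model, steps=8, conditioning=conditioning, ...)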

def load_stable_audio_model():
    """Load stable-audio-open-small model if not already loaded."""
    with model_lock:
        if 'stable_audio_model' not in model_cache:
            print("🔄 Loading stable-audio-open-small model...")
            load_start = time.time()

            # Authenticate with HF
            hf_token = os.getenv('HF_TOKEN')
            if hf_token:
                login(token=hf_token)
                print("✅ HF authenticated")

            # Load model
            model, config = get_pretrained_model("stabilityai/stable-audio-open-small")
            device = "cuda" if torch.cuda.is_available() else "cpu"
            model = model.to(device)
            if device == "cuda":
                model = model.half()

            load_time = time.time() - load_start
            print(f"✅ Model loaded on {device} in {load_time:.2f}s")

            # Aggressive model persistence - warm up with dummy generation
            print("🔥 Warming up model...")
            warmup_start = time.time()
            try:
                dummy_conditioning = [{"prompt": "test", "seconds_total": 12}]
                with torch.no_grad():
                    _ = generate_diffusion_cond(
                        model,
                        steps=1,  # Minimal steps for warmup
                        cfg_scale=1.0,
                        conditioning=dummy_conditioning,
                        sample_size=config["sample_size"],
                        sampler_type="pingpong",
                        device=device,
                        seed=42
                    )
                warmup_time = time.time() - warmup_start
                print(f"🔥 Model warmed up in {warmup_time:.2f}s")
            except Exception as e:
                print(f"⚠️ Warmup failed (but continuing): {e}")

            model_cache['stable_audio_model'] = model
            model_cache['stable_audio_config'] = config
            model_cache['stable_audio_device'] = device
            print("✅ Stable Audio model ready for fast generation!")
        else:
            print("♻️ Using cached model (should be fast!)")

    return (model_cache['stable_audio_model'],
            model_cache['stable_audio_config'],
            model_cache['stable_audio_device'])
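
# Illustrative note: after the first call, the model, its config, and the device
# string live in model_cache, so later calls inside the same worker should hit the
# cached branch, e.g.
#
#     model, config, device = load_stable_audio_model()  # slow the first time
#     model, config, device = load_stable_audio_model()  # cache hit afterwards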

# ZeroGPU: request a GPU for the duration of each call. The decorator is assumed
# from the `import spaces` above and the Space's "Running on Zero" hardware; without
# it, a ZeroGPU Space cannot allocate a GPU for this function.
@spaces.GPU
def generate_stable_audio_loop(prompt, loop_type, bpm, bars, steps, cfg_scale, seed=-1):
    """Generate a BPM-aware loop using stable-audio-open-small."""
    try:
        total_start = time.time()

        # Model loading timing
        load_start = time.time()
        model, config, device = load_stable_audio_model()
        load_time = time.time() - load_start

        # Calculate loop duration based on BPM and bars
        seconds_per_beat = 60.0 / bpm
        seconds_per_bar = seconds_per_beat * 4  # 4/4 time
        target_loop_duration = seconds_per_bar * bars
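
        # Worked example (illustration only): at the default 120 bpm with 4 bars,
        # 60 / 120 = 0.5 s per beat, x4 = 2.0 s per bar, x4 bars = 8.0 s of loop,
        # which fits inside the model's fixed 12 s generation window.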

        # Enhance prompt based on loop type and BPM - minimal modification
        if loop_type == "drums":
            enhanced_prompt = f"{prompt} {bpm}bpm"
            # Comprehensive negative prompting for drums - exclude all melodic/harmonic content
            negative_prompt = "melody, harmony, pitched instruments, vocals, singing, piano, guitar, bass, synth, strings, horns, woodwinds, flute, saxophone, violin, cello, organ, keyboard, chords, notes, musical scale, tonal, melodic, harmonic"
        else:  # instruments
            enhanced_prompt = f"{prompt} {bpm}bpm"
            # Comprehensive negative prompting for instruments - exclude all percussive content
            negative_prompt = "drums, percussion, kick, snare, hi-hat, cymbals, tom, drum kit, rhythm section, beats, drumming, percussive, drum machine, 808, trap drums, boom bap drums, breakbeat, drum breaks, kick drum, snare drum"

        # Set seed
        if seed == -1:
            seed = random.randint(0, 2**32 - 1)
        torch.manual_seed(seed)
        if device == "cuda":
            torch.cuda.manual_seed(seed)

        print(f"🎵 Generating {loop_type} loop:")
        print(f"   Enhanced prompt: {enhanced_prompt}")
        print(f"   Target duration: {target_loop_duration:.2f}s ({bars} bars at {bpm}bpm)")
        print(f"   Steps: {steps}, CFG Scale: {cfg_scale}")
        print(f"   Seed: {seed}")

        # Prepare conditioning
        conditioning_start = time.time()
        conditioning = [{
            "prompt": enhanced_prompt,
            "seconds_total": 12  # Model generates 12s max
        }]
        negative_conditioning = [{
            "prompt": negative_prompt,
            "seconds_total": 12
        }]
        conditioning_time = time.time() - conditioning_start

        # Generation timing
        generation_start = time.time()

        # Clear GPU cache once before generation (not after)
        # if device == "cuda":
        #     torch.cuda.empty_cache()

        with torch.cuda.amp.autocast(enabled=(device == "cuda")):
            output = generate_diffusion_cond(
                model,
                steps=steps,  # User-configurable steps
                cfg_scale=cfg_scale,  # User-configurable CFG scale
                conditioning=conditioning,
                negative_conditioning=negative_conditioning,
                sample_size=config["sample_size"],
                sampler_type="pingpong",
                device=device,
                seed=seed
            )
        generation_time = time.time() - generation_start

        # Post-processing timing
        postproc_start = time.time()

        # Post-process audio
        output = rearrange(output, "b d n -> d (b n)")  # (2, N) stereo
        output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1)

        # Extract the loop portion
        sample_rate = config["sample_rate"]
        loop_samples = int(target_loop_duration * sample_rate)
        available_samples = output.shape[1]

        if loop_samples > available_samples:
            loop_samples = available_samples
            actual_duration = available_samples / sample_rate
            print(f"⚠️ Requested {target_loop_duration:.2f}s, got {actual_duration:.2f}s")

        # Extract loop from beginning (cleanest beat alignment)
        loop_output = output[:, :loop_samples]
        loop_output_int16 = loop_output.mul(32767).to(torch.int16).cpu()

        # Save to temporary file
        loop_filename = f"loop_{loop_type}_{bpm}bpm_{bars}bars_{seed}.wav"
        torchaudio.save(loop_filename, loop_output_int16, sample_rate)

        postproc_time = time.time() - postproc_start
        total_time = time.time() - total_start
        actual_duration = loop_samples / sample_rate

        # Detailed timing breakdown
        print("⏱️ Timing breakdown:")
        print(f"   Model load: {load_time:.2f}s")
        print(f"   Conditioning: {conditioning_time:.3f}s")
        print(f"   Generation: {generation_time:.2f}s")
        print(f"   Post-processing: {postproc_time:.3f}s")
        print(f"   Total: {total_time:.2f}s")
        print(f"✅ {loop_type.title()} loop: {actual_duration:.2f}s audio in {total_time:.2f}s")

        return loop_filename, f"Generated {actual_duration:.2f}s {loop_type} loop at {bpm}bpm ({bars} bars) in {total_time:.2f}s (steps: {steps}, cfg: {cfg_scale})"

    except Exception as e:
        print(f"❌ Generation error: {str(e)}")
        return None, f"Error: {str(e)}"
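
# Illustrative call (assumed defaults, nothing executed at import time): a four-bar
# drum loop at 120 bpm with 8 diffusion steps and cfg 1.0 would look like
#
#     wav_path, status = generate_stable_audio_loop(
#         "lofi hiphop with pianos", "drums", bpm=120, bars=4, steps=8, cfg_scale=1.0
#     )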

def combine_loops(drums_audio, instruments_audio, bpm, bars, num_repeats):
    """Combine drum and instrument loops with specified repetitions."""
    try:
        if not drums_audio and not instruments_audio:
            return None, "No audio files to combine"

        # Calculate timing
        seconds_per_beat = 60.0 / bpm
        seconds_per_bar = seconds_per_beat * 4
        loop_duration = seconds_per_bar * bars
        total_duration = loop_duration * num_repeats

        print("🎛️ Combining loops:")
        print(f"   Loop duration: {loop_duration:.2f}s ({bars} bars)")
        print(f"   Repeats: {num_repeats}")
        print(f"   Total duration: {total_duration:.2f}s")

        combined_audio = None
        sample_rate = None

        # Process each audio file
        for audio_path, audio_type in [(drums_audio, "drums"), (instruments_audio, "instruments")]:
            if audio_path:
                # Load audio
                waveform, sr = torchaudio.load(audio_path)
                if sample_rate is None:
                    sample_rate = sr

                # Ensure we have the exact loop duration
                target_samples = int(loop_duration * sr)
                if waveform.shape[1] > target_samples:
                    waveform = waveform[:, :target_samples]
                elif waveform.shape[1] < target_samples:
                    # Pad if necessary
                    padding = target_samples - waveform.shape[1]
                    waveform = torch.cat([waveform, torch.zeros(waveform.shape[0], padding)], dim=1)

                # Repeat the loop
                repeated_waveform = waveform.repeat(1, num_repeats)
                print(f"   {audio_type}: {waveform.shape[1]/sr:.2f}s repeated {num_repeats}x = {repeated_waveform.shape[1]/sr:.2f}s")

                # Add to combined audio
                if combined_audio is None:
                    combined_audio = repeated_waveform
                else:
                    combined_audio = combined_audio + repeated_waveform

        if combined_audio is None:
            return None, "No valid audio to combine"

        # Normalize to prevent clipping
        combined_audio = combined_audio / torch.max(torch.abs(combined_audio))
        combined_audio = combined_audio.clamp(-1, 1)

        # Convert to int16 and save
        combined_audio_int16 = combined_audio.mul(32767).to(torch.int16)
        combined_filename = f"combined_{bpm}bpm_{bars}bars_{num_repeats}loops_{random.randint(1000, 9999)}.wav"
        torchaudio.save(combined_filename, combined_audio_int16, sample_rate)

        actual_duration = combined_audio.shape[1] / sample_rate
        status = f"Combined into {actual_duration:.2f}s audio ({num_repeats} × {bars} bars at {bpm}bpm)"
        print(f"✅ {status}")

        return combined_filename, status

    except Exception as e:
        print(f"❌ Combine error: {str(e)}")
        return None, f"Combine error: {str(e)}"
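
# Illustrative timing (assumed defaults): with 4 bars at 120 bpm each loop is 8.0 s,
# so num_repeats=2 yields roughly 16 s of combined audio, comfortably under the
# ~30 s ceiling suggested in the UI below.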

def transform_with_melodyflow_api(audio_path, prompt, solver="euler", flowstep=0.12):
    """Transform audio using the facebook/MelodyFlow space API."""
    if audio_path is None:
        return None, "❌ No audio file provided"

    try:
        # Initialize client for the Facebook MelodyFlow space
        client = Client("facebook/MelodyFlow")

        # Set steps based on solver
        if solver == "midpoint":
            base_steps = 128
            effective_steps = base_steps // 2  # 64 effective steps
        else:  # euler
            base_steps = 125
            effective_steps = base_steps // 5  # 25 effective steps

        print("🎛️ MelodyFlow transformation:")
        print(f"   Prompt: {prompt}")
        print(f"   Solver: {solver} ({effective_steps} effective steps)")
        print(f"   Flowstep: {flowstep}")

        # Call the MelodyFlow API
        result = client.predict(
            model="facebook/melodyflow-t24-30secs",
            text=prompt,
            solver=solver,
            steps=base_steps,
            target_flowstep=flowstep,
            regularize=(solver == "euler"),
            regularization_strength=0.2,
            duration=30,
            melody=handle_file(audio_path),
            api_name="/predict"
        )

        if result and len(result) > 0 and result[0]:
            # Save the result locally
            output_filename = f"melodyflow_transformed_{random.randint(1000, 9999)}.wav"
            import shutil
            shutil.copy2(result[0], output_filename)
            status_msg = f"✅ Transformed with prompt: '{prompt}' (flowstep: {flowstep}, {effective_steps} steps)"
            return output_filename, status_msg
        else:
            return None, "❌ MelodyFlow API returned no results"

    except Exception as e:
        return None, f"❌ MelodyFlow API error: {str(e)}"

def extend_with_musicgen_api(audio_path, prompt_duration, musicgen_model, output_duration):
    """Extend audio using the micro-slot-machine space API."""
    if audio_path is None:
        return None, "❌ No audio file provided"

    try:
        # Initialize client for the micro-slot-machine space
        client = Client("thepatch/micro-slot-machine")

        print("🎼 MusicGen extension:")
        print(f"   Prompt duration: {prompt_duration} (type: {type(prompt_duration)})")
        print(f"   Model: {musicgen_model}")
        print(f"   Output duration: {output_duration} (type: {type(output_duration)})")

        # Call the continue_music API
        result = client.predict(
            input_audio_path=handle_file(audio_path),
            prompt_duration=prompt_duration,  # Integer from dropdown
            musicgen_model=musicgen_model,
            output_duration=float(output_duration),  # Ensure it's a float
            api_name="/continue_music"
        )

        if result:
            # Save the result locally
            output_filename = f"musicgen_extended_{random.randint(1000, 9999)}.wav"
            import shutil
            shutil.copy2(result, output_filename)
            status_msg = f"✅ Extended with {musicgen_model} (prompt: {prompt_duration}s, output: {output_duration}s)"
            return output_filename, status_msg
        else:
            return None, "❌ MusicGen API returned no results"

    except Exception as e:
        return None, f"❌ MusicGen API error: {str(e)}"

def calculate_optimal_bars(bpm):
    """Calculate optimal bar count for given BPM to fit in ~10s."""
    seconds_per_beat = 60.0 / bpm
    seconds_per_bar = seconds_per_beat * 4
    max_duration = 10.0
    for bars in [8, 4, 2, 1]:
        if seconds_per_bar * bars <= max_duration:
            return bars
    return 1
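
# Worked examples (illustration only): at 120 bpm a 4/4 bar lasts 2.0 s, so 8 bars
# (16 s) is too long and the function returns 4 (8.0 s <= 10 s); at 90 bpm a bar
# lasts about 2.67 s, so it returns 2 (about 5.3 s).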

def update_transform_prompt(variation_choice):
    """Update the transformation prompt based on variation selection."""
    if variation_choice == "custom":
        return gr.update(value="", placeholder="enter your custom transformation prompt", interactive=True)
    elif variation_choice in MELODYFLOW_VARIATIONS:
        return gr.update(value=MELODYFLOW_VARIATIONS[variation_choice], interactive=True)
    else:
        return gr.update(value="", placeholder="select a variation or enter custom prompt", interactive=True)
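
# End-to-end sketch (illustrative only; the Gradio handlers below wire these
# functions together in this same order, and nothing here runs at import time):
#
#     drums, _ = generate_stable_audio_loop("lofi hiphop with pianos", "drums", 120, 4, 8, 1.0)
#     instr, _ = generate_stable_audio_loop("lofi hiphop with pianos", "instruments", 120, 4, 8, 1.0)
#     combined, _ = combine_loops(drums, instr, 120, 4, 2)
#     styled, _ = transform_with_melodyflow_api(combined, MELODYFLOW_VARIATIONS["lofi_chill"])
#     longer, _ = extend_with_musicgen_api(styled, 5, "thepatch/vanya_ai_dnb_0.1 (small)", 20.0)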

# ========== GRADIO INTERFACE ==========
with gr.Blocks(title="stable-melodyflow") as iface:
    gr.Markdown("# stable-melodyflow (aka jerry and terry)")
    gr.Markdown("**generate synchronized drum and instrument loops with stable-audio-open-small (jerry), then transform with melodyflow (terry)!**")

    # ========== MODELS & PROJECT INFO ==========
    with gr.Accordion("some info about these models", open=False):

        with gr.Accordion("🚀 stable-audio-open-small", open=False):
            gr.Markdown("""
            **stable-audio-open-small** is an incredibly fast model from the zachs and friends at Stability AI. It's capable of generating 12 seconds of audio in under a second, which gives rise to a lot of very interesting kinds of UX.

            **note about generation speed in this zerogpu space:** you'll notice generation times are a little slower here than if you were to use the model on a local gpu. that's just a result of the way zerogpu spaces work i think... let me know if there's a way to keep the model loaded in a zerogpu space!

            **links:**
            - 🤗 [model on HuggingFace](https://huggingface.co/stabilityai/stable-audio-open-small)

            there's a docker container at this repo that can be spun up as a standalone api specifically for stable-audio-open-small:
            - [stable-audio-api](https://github.com/betweentwomidnights/stable-audio-api)
            """)

        with gr.Accordion("🎛️ melodyflow", open=False):
            gr.Markdown("""
            **MelodyFlow** is a model by meta that uses regularized latent inversion to transform input audio.

            It's not officially a part of the audiocraft repo yet, but we use it as a docker container in the backend for gary4live. i really enjoy turning my guitar riffs into orchestras.

            **links:**
            - 🤗 [Official MelodyFlow Space](https://huggingface.co/spaces/Facebook/MelodyFlow)
            - [our melodyflow api](https://github.com/betweentwomidnights/melodyflow)
            """)

        with gr.Accordion("gary4live project", open=False):
            gr.Markdown("""
            **gary4live** is a free/open source project that uses these models, along with musicGen, inside of ableton live to iterate on your projects with you. i run a backend myself so that we can all experiment with it, but you can also spin the backend up locally using docker-compose with our repo.

            **project links:**
            - [frontend repo](https://github.com/betweentwomidnights/gary4live)
            - [backend repo](https://github.com/betweentwomidnights/gary-backend-combined)

            **installers:**
            - [p.c. & mac installers on gumroad](https://thepatch.gumroad.com/l/gary4live)
            """)

        with gr.Accordion("how this works", open=False):
            gr.Markdown("""
            **workflow:**
            1. **set global bpm and bars** - affects both drum and instrument generation
            2. **generate drum loop** - creates bpm-aware percussion, with negative prompting to try to keep instruments out
            3. **generate instrument loop** - creates melodic/harmonic content, with negative prompting to try to keep drums out
            4. **combine loops** - layer them together with repetitions (up to 30s)
            5. **transform** - use melodyflow to stylistically transform the combined result

            **features:**
            - bpm-aware generation keeps the two loops in sync (most of the time lol)
            - negative prompting separates drums from instruments (most of the time)
            - smart bar calculation optimizes loop length for the bpm
            - preset transformation styles for braindead ease of use
            """)

    # ========== GLOBAL CONTROLS ==========
    gr.Markdown("## 🎛️ global settings")

    with gr.Row():
        global_bpm = gr.Dropdown(
            label="global bpm",
            choices=[90, 100, 110, 120, 130, 140, 150],
            value=120,
            info="bpm applied to both drum and instrument generation. keep this the same for the combine step to work correctly"
        )
        global_bars = gr.Dropdown(
            label="loop length (bars)",
            choices=[1, 2, 4],
            value=4,
            info="number of bars for each loop. keep this the same for both pieces of audio"
        )
        base_prompt = gr.Textbox(
            label="base prompt",
            value="lofi hiphop with pianos",
            placeholder="e.g., 'aggressive techno', 'lofi hiphop', 'chillwave', 'liquid drum and bass'",
            info="prompt applied to both loops. make it more drum/instrument specific for best results"
        )

    with gr.Row():
        generation_steps = gr.Slider(
            label="generation steps",
            minimum=4,
            maximum=16,
            step=1,
            value=8,
            info="more steps = higher quality but slower generation"
        )
        cfg_scale = gr.Slider(
            label="cfg scale",
            minimum=0.5,
            maximum=2.0,
            step=0.1,
            value=1.0,
            info="higher values = more prompt adherence but potentially less natural"
        )

    # Auto-suggest optimal bars based on BPM
    def update_suggested_bars(bpm):
        optimal = calculate_optimal_bars(bpm)
        return gr.update(info=f"Suggested: {optimal} bars for {bpm}bpm (≤10s)")

    global_bpm.change(update_suggested_bars, inputs=[global_bpm], outputs=[global_bars])

    # ========== LOOP GENERATION ==========
    gr.Markdown("## step one: generate individual loops")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### drums")
            generate_drums_btn = gr.Button("generate drums", variant="primary", size="lg")
            drums_audio = gr.Audio(label="drum loop", type="filepath", show_download_button=True)
            drums_status = gr.Textbox(label="status", value="ready to generate")

        with gr.Column():
            gr.Markdown("### instruments")
            generate_instruments_btn = gr.Button("generate instruments", variant="secondary", size="lg")
            instruments_audio = gr.Audio(label="instrument loop", type="filepath", show_download_button=True)
            instruments_status = gr.Textbox(label="status", value="ready to generate")

    # Seed controls
    with gr.Row():
        drums_seed = gr.Number(label="drums seed", value=-1, info="-1 for random")
        instruments_seed = gr.Number(label="instruments seed", value=-1, info="-1 for random")

    # ========== COMBINATION ==========
    gr.Markdown("## step two: combine loops")

    with gr.Row():
        num_repeats = gr.Slider(
            label="number of repetitions",
            minimum=1,
            maximum=5,
            step=1,
            value=2,
            info="how many times to repeat each loop (creates longer audio). aim for 30 seconds max"
        )

    combine_btn = gr.Button("combine", variant="primary", size="lg")
    combined_audio = gr.Audio(label="combined loops", type="filepath", show_download_button=True)
    combine_status = gr.Textbox(label="status", value="Generate loops first")

    # ========== MELODYFLOW TRANSFORMATION ==========
    gr.Markdown("## step three: transform with melodyflow")

    with gr.Row():
        with gr.Column():
            # Variation dropdown
            variation_choice = gr.Dropdown(
                label="transformation style preset",
                choices=["custom"] + list(MELODYFLOW_VARIATIONS.keys()),
                value="custom",
                info="select a preset style or choose 'custom' for your own prompt"
            )
            transform_prompt = gr.Textbox(
                label="transformation prompt",
                value="",
                placeholder="enter your custom transformation prompt",
                lines=3,
                info="describes the style transformation to apply"
            )

        with gr.Column():
            transform_solver = gr.Dropdown(
                label="solver",
                choices=["euler", "midpoint"],
                value="euler",
                info="EULER: faster (25 steps), MIDPOINT: slower (64 steps)"
            )
            transform_flowstep = gr.Slider(
                label="transform intensity",
                minimum=0.0,
                maximum=0.15,
                step=0.01,
                value=0.12,
                info="Lower = more dramatic transformation"
            )

    transform_btn = gr.Button("transform audio", variant="secondary", size="lg")
    transformed_audio = gr.Audio(label="transformed audio", type="filepath", show_download_button=True)
    transform_status = gr.Textbox(label="status", value="Combine audio first")

    # ========== MUSICGEN EXTENSION ==========
    gr.Markdown("## step four (optional): extend with musicgen")

    with gr.Row():
        with gr.Column():
            musicgen_prompt_duration = gr.Dropdown(
                label="prompt duration (seconds)",
                choices=[3, 5, 7, 10],  # Integers, since the API function expects numbers
                value=5,
                info="how much of the end to use as prompt for continuation"
            )
            musicgen_output_duration = gr.Slider(
                label="extension duration (seconds)",
                minimum=10.0,
                maximum=30.0,
                step=1.0,
                value=20.0,
                info="how much new audio to generate"
            )

        with gr.Column():
            musicgen_model_choice = gr.Dropdown(
                label="musicgen model",
                choices=[
                    "thepatch/vanya_ai_dnb_0.1 (small)",
                    "thepatch/bleeps-medium (medium)",
                    "thepatch/hoenn_lofi (large)"
                ],
                value="thepatch/vanya_ai_dnb_0.1 (small)",
                info="various musicgen fine-tunes for different styles"
            )

    extend_btn = gr.Button("extend with musicgen", variant="primary", size="lg")
    extended_audio = gr.Audio(label="extended audio", type="filepath")
    extend_status = gr.Textbox(label="status", value="Transform audio first")

    # ========== EVENT HANDLERS ==========

    # Update transform prompt when variation is selected
    variation_choice.change(
        update_transform_prompt,
        inputs=[variation_choice],
        outputs=[transform_prompt]
    )

    # Generate drums
    generate_drums_btn.click(
        generate_stable_audio_loop,
        inputs=[base_prompt, gr.State("drums"), global_bpm, global_bars, generation_steps, cfg_scale, drums_seed],
        outputs=[drums_audio, drums_status]
    )

    # Generate instruments
    generate_instruments_btn.click(
        generate_stable_audio_loop,
        inputs=[base_prompt, gr.State("instruments"), global_bpm, global_bars, generation_steps, cfg_scale, instruments_seed],
        outputs=[instruments_audio, instruments_status]
    )

    # Combine loops
    combine_btn.click(
        combine_loops,
        inputs=[drums_audio, instruments_audio, global_bpm, global_bars, num_repeats],
        outputs=[combined_audio, combine_status]
    )

    # Transform with MelodyFlow
    transform_btn.click(
        transform_with_melodyflow_api,
        inputs=[combined_audio, transform_prompt, transform_solver, transform_flowstep],
        outputs=[transformed_audio, transform_status]
    )

    # Extend with MusicGen
    extend_btn.click(
        extend_with_musicgen_api,
        inputs=[transformed_audio, musicgen_prompt_duration, musicgen_model_choice, musicgen_output_duration],
        outputs=[extended_audio, extend_status]
    )

if __name__ == "__main__":
    iface.launch()