Spaces:

thepatch
/

the-slot-machine

Sleeping

App Files Files Community

the-slot-machine / app.py

thecollabagepatch

herewego

3e4e311 8 months ago

raw

history blame

6.84 kB

	import gradio as gr
	from musiclang_predict import MusicLangPredictor
	import random
	import subprocess
	import os
	import torchaudio
	import torch
	import numpy as np
	from audiocraft.models import MusicGen
	from audiocraft.data.audio import audio_write
	from pydub import AudioSegment

	# Utility Functions
	def peak_normalize(y, target_peak=0.97):
	    """Scale ``y`` so that its largest absolute sample equals ``target_peak``.

	    Args:
	        y: NumPy array of audio samples.
	        target_peak: Desired maximum absolute amplitude after scaling.

	    Returns:
	        The rescaled array. Empty or all-zero input is returned unchanged,
	        because there is no peak to scale from.
	    """
	    if np.size(y) == 0:
	        return y

	    peak = np.max(np.abs(y))
	    if peak == 0:
	        # Silent audio: dividing by a zero peak would fill the output with NaN.
	        return y

	    return target_peak * (y / peak)

	def rms_normalize(y, target_rms=0.05):
	    """Scale ``y`` so that its root-mean-square level equals ``target_rms``.

	    Args:
	        y: NumPy array of audio samples.
	        target_rms: Desired RMS level after scaling.

	    Returns:
	        The rescaled array. Empty or all-zero input is returned unchanged,
	        because its RMS is undefined or zero and cannot be scaled.
	    """
	    if np.size(y) == 0:
	        return y

	    rms = np.sqrt(np.mean(y**2))
	    if rms == 0:
	        # Silent audio: dividing by a zero RMS would fill the output with NaN.
	        return y

	    return y * (target_rms / rms)

	def preprocess_audio(waveform):
	    """Peak- then RMS-normalize a waveform tensor and return it on ``device``.

	    The tensor is moved to the CPU and squeezed before conversion to NumPy.
	    The normalized samples are wrapped back into a tensor, given a new
	    leading dimension, and placed on the module-level ``device``.
	    """
	    # NumPy conversion only works for CPU tensors.
	    samples = waveform.cpu().squeeze().numpy()

	    peak_scaled = peak_normalize(samples)
	    leveled = rms_normalize(peak_scaled)

	    as_tensor = torch.from_numpy(leveled)
	    return as_tensor.unsqueeze(0).to(device)

	def create_slices(song, sr, slice_duration, bpm, num_slices=5):
	    """Cut ``num_slices`` excerpts of ``slice_duration`` seconds out of ``song``.

	    The first excerpt always starts at sample 0 (and is shorter than the
	    requested length only if the song itself is shorter). Every further
	    excerpt starts at a randomly chosen bar boundary (4 beats at ``bpm``)
	    located at least one slice length into the song, and wraps around to the
	    beginning of the song when it runs past the end.

	    Args:
	        song: Audio tensor whose last dimension is time in samples.
	        sr: Sample rate in Hz.
	        slice_duration: Length of each excerpt in seconds.
	        bpm: Tempo used to derive the bar length.
	        num_slices: Number of excerpts to return.

	    Returns:
	        A list of tensors with the same leading dimensions as ``song``.

	    Raises:
	        ValueError: If ``song`` contains no samples.
	    """
	    total_samples = song.shape[-1]
	    if total_samples == 0:
	        raise ValueError("cannot create slices from an empty song")

	    slice_len = int(slice_duration * sr)
	    # Clamp to one sample so an extreme bpm can never yield a zero range step.
	    bar_len = max(int(4 * 60 / bpm * sr), 1)

	    # Ensure the first slice is from the beginning of the song
	    slices = [song[..., :slice_len]]

	    # Bar-aligned start positions; empty when the song is not longer than a slice.
	    candidate_starts = range(slice_len, total_samples, bar_len)
	    offsets = torch.arange(slice_len, device=song.device)

	    for _ in range(1, num_slices):
	        start = random.choice(candidate_starts) if candidate_starts else 0
	        # Modular indexing along the time axis wraps past the end of the song
	        # and always yields exactly slice_len samples, whatever the channel
	        # count of the tensor.
	        wrapped = (start + offsets) % total_samples
	        slices.append(song[..., wrapped])

	    return slices

	def calculate_duration(bpm, min_duration=29, max_duration=30):
	    """Return a duration in seconds that is a whole number of 4-beat bars.

	    The bar count is first raised until the total reaches ``min_duration``,
	    then lowered (never below one bar) while the total exceeds
	    ``max_duration``. The result can therefore fall below ``min_duration``
	    when no whole number of bars fits inside the window.
	    """
	    bar_seconds = 4 * 60 / bpm
	    bar_count = max(min_duration // bar_seconds, 1)

	    # Grow until the minimum length is reached.
	    while bar_seconds * bar_count < min_duration:
	        bar_count += 1

	    # Shrink back under the maximum, keeping at least one bar.
	    while bar_count > 1 and bar_seconds * bar_count > max_duration:
	        bar_count -= 1

	    return bar_seconds * bar_count

	def generate_music(seed, use_chords, chord_progression, prompt_duration, musicgen_model, num_iterations, bpm):
	    """Generate a MusicLang score, continue it with MusicGen, and return an MP3 path.

	    Steps:
	        1. Predict a score with MusicLang (optionally from a chord progression).
	        2. Write the score to MIDI and render it to WAV with fluidsynth.
	        3. Cut the rendered audio into slices and use the start of each slice
	           as a prompt for ``MusicGen.generate_continuation``.
	        4. Concatenate all continuations and export them as one MP3.

	    Args:
	        seed: Seed for MusicLang; blank or non-numeric values pick a random one.
	        use_chords: Whether to condition on ``chord_progression``.
	        chord_progression: Chord string passed to ``predict_chords``.
	        prompt_duration: Seconds of each slice used as the MusicGen prompt.
	        musicgen_model: Name passed to ``MusicGen.get_pretrained``.
	        num_iterations: Number of continuations to generate.
	        bpm: Tempo used for the MIDI export and the duration calculation.

	    Returns:
	        Filename of the combined MP3.

	    Raises:
	        subprocess.CalledProcessError: If fluidsynth exits with an error.
	    """
	    # int("") raises ValueError, so blank and non-numeric seeds both fall
	    # back to a random seed here.
	    try:
	        seed = int(seed)
	    except (TypeError, ValueError):
	        seed = random.randint(1, 10000)

	    # range() below needs an int even if the UI delivers a whole-number float.
	    num_iterations = int(num_iterations)

	    ml = MusicLangPredictor('musiclang/musiclang-v2')

	    nb_tokens = 4096
	    temperature = 0.9
	    top_p = 1.0

	    if use_chords and chord_progression and chord_progression.strip():
	        score = ml.predict_chords(
	            chord_progression,
	            time_signature=(4, 4),
	            temperature=temperature,
	            topp=top_p,
	            rng_seed=seed
	        )
	    else:
	        score = ml.predict(
	            nb_tokens=nb_tokens,
	            temperature=temperature,
	            topp=top_p,
	            rng_seed=seed
	        )

	    midi_filename = f"output_{seed}.mid"
	    wav_filename = midi_filename.replace(".mid", ".wav")

	    # Every intermediate file is tracked here so the finally-block can remove
	    # it even when a later step raises.
	    temp_files = [midi_filename, wav_filename]

	    try:
	        score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))

	        # check=True surfaces a fluidsynth failure here instead of as a
	        # confusing "file not found" when the WAV is loaded below.
	        subprocess.run(
	            ["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"],
	            check=True,
	        )

	        # Load the generated audio
	        song, sr = torchaudio.load(wav_filename)
	        song = song.to(device)

	        # Use the user-provided BPM value for duration calculation
	        duration = calculate_duration(bpm)

	        # Create slices from the song using the user-provided BPM value
	        slices = create_slices(song, sr, 35, bpm, num_slices=5)

	        # Load the model
	        model_continue = MusicGen.get_pretrained(musicgen_model)

	        # Setting generation parameters
	        model_continue.set_generation_params(
	            use_sampling=True,
	            top_k=250,
	            top_p=0.0,
	            temperature=1.0,
	            duration=duration,
	            cfg_coef=3
	        )

	        continuation_files = []

	        for i in range(num_iterations):
	            # Cycle through the slices when more iterations than slices are requested.
	            slice_idx = i % len(slices)

	            print(f"Running iteration {i + 1} using slice {slice_idx}...")

	            prompt_waveform = slices[slice_idx][..., :int(prompt_duration * sr)]
	            prompt_waveform = preprocess_audio(prompt_waveform)

	            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
	            output = output.cpu()  # Move the output tensor back to CPU

	            # Make sure the output tensor has at most 2 dimensions
	            if len(output.size()) > 2:
	                output = output.squeeze()

	            # audio_write appends the format suffix itself and returns the
	            # path it wrote, so use that instead of guessing the filename.
	            written_path = str(audio_write(
	                f'continue_{i}',
	                output,
	                model_continue.sample_rate,
	                strategy="loudness",
	                loudness_compressor=True,
	            ))
	            temp_files.append(written_path)
	            continuation_files.append(written_path)

	        # Combine all audio files
	        combined_audio = AudioSegment.empty()
	        for filename in continuation_files:
	            combined_audio += AudioSegment.from_wav(filename)

	        combined_audio_filename = f"combined_audio_{seed}.mp3"
	        combined_audio.export(combined_audio_filename, format="mp3")

	        return combined_audio_filename
	    finally:
	        # Clean up temporary files, including after a failure part-way through.
	        for path in temp_files:
	            if os.path.exists(path):
	                os.remove(path)

	# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
	_device_name = "cuda" if torch.cuda.is_available() else "cpu"
	device = torch.device(_device_name)

	# Input widgets, listed in the same order as generate_music's parameters.
	_input_widgets = [
	    gr.Textbox(label="Seed (leave blank for random)", value=""),
	    gr.Checkbox(label="Control Chord Progression", value=False),
	    gr.Textbox(label="Chord Progression (e.g., Am CM Dm E7 Am)", visible=True),
	    gr.Dropdown(label="Prompt Duration (seconds)", choices=list(range(1, 11)), value=7),
	    gr.Textbox(label="MusicGen Model", value="thepatch/vanya_ai_dnb_0.1"),
	    gr.Slider(label="Number of Iterations", minimum=1, maximum=10, step=1, value=3),
	    gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=140),
	]

	_description = "Enter a seed to generate music or leave blank for a random tune! Optionally, control the chord progression, prompt duration, MusicGen model, number of iterations, and BPM."

	# Single-function Gradio app: the widgets feed generate_music and the
	# returned file path is shown in an audio player.
	iface = gr.Interface(
	    fn=generate_music,
	    inputs=_input_widgets,
	    outputs=gr.Audio(label="Generated Music"),
	    title="Music Generation Slot Machine",
	    description=_description,
	)

	iface.launch()