import os import sys import gradio as gr from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables import torchaudio import datetime import hashlib def generate_filename(): now = datetime.datetime.now() seconds_since_epoch = int(now.timestamp()) # Convert seconds to string seconds_str = str(seconds_since_epoch) # Hash the string using SHA-256 hash_object = hashlib.sha256(seconds_str.encode()) hash_string = hash_object.hexdigest() return hash_string def get_args( task, text="", audio=None, model_name="InspireMusic-Base", chorus="intro", fast=False, fade_out=True, output_sample_rate=48000, max_generate_audio_seconds=30.0, time_start = 0.0, time_end=30.0, trim=False): # This function constructs the arguments required for InspireMusic args = { "task" : task, "text" : text, "audio_prompt" : audio, "model_name" : model_name, "chorus" : chorus, "fast" : fast, "fade_out" : fade_out, "trim" : trim, "output_sample_rate" : output_sample_rate, "min_generate_audio_seconds": 10.0, "max_generate_audio_seconds": max_generate_audio_seconds, "model_dir" : os.path.join("pretrained_models", model_name), "result_dir" : "exp/inspiremusic", "output_fn" : generate_filename(), "format" : "wav", "time_start" : time_start, "time_end": time_end, "fade_out_duration": 1.0, } if args["time_start"] is None: args["time_start"] = 0.0 args["time_end"] = args["time_start"] + args["max_generate_audio_seconds"] print(args) return args def music_generation(args): set_env_variables() model = InspireMusicUnified( model_name=args["model_name"], model_dir=args["model_dir"], min_generate_audio_seconds=args["min_generate_audio_seconds"], max_generate_audio_seconds=args["max_generate_audio_seconds"], sample_rate=24000, output_sample_rate=args["output_sample_rate"], load_jit=True, load_onnx=False, fast=args["fast"], result_dir=args["result_dir"]) output_path = model.inference( task=args["task"], text=args["text"], audio_prompt=args["audio_prompt"], chorus=args["chorus"], time_start=args["time_start"], time_end=args["time_end"], output_fn=args["output_fn"], max_audio_prompt_length=args["max_audio_prompt_length"], fade_out_duration=args["fade_out_duration"], output_format=args["format"], fade_out_mode=args["fade_out"], trim=args["trim"]) return output_path def update_text(): global text_input # Declare as global to modify the outer scope variable text_input = "New value set by button click" return text_input default_prompts = [ "Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.", "Compose an uplifting R&B song.", "Create an emotional, introspective folk song with acoustic guitar and soft vocals." ] def cut_audio(audio_file, cut_seconds=5): audio, sr = torchaudio.load(audio_file) num_samples = cut_seconds * sr cutted_audio = audio[:, :num_samples] output_path = os.path.join(os.getcwd(), "audio_prompt_" + generate_filename() + ".wav") torchaudio.save(output_path, cutted_audio, sr) return output_path def run_text2music(text, model_name, chorus, fast, fade_out, output_sample_rate, max_generate_audio_seconds): args = get_args( task='continuation', text=text, audio=None, model_name=model_name, chorus=chorus, fast=fast, fade_out=fade_out, output_sample_rate=output_sample_rate, max_generate_audio_seconds=max_generate_audio_seconds) return music_generation(args) def run_continuation(text, audio, model_name, chorus, fast, fade_out, output_sample_rate, max_generate_audio_seconds): args = get_args( task='text-to-music', text=text, audio=cut_audio(audio, cut_seconds=5), model_name=model_name, chorus=chorus, fast=fast, fade_out=fade_out, output_sample_rate=output_sample_rate, max_generate_audio_seconds=max_generate_audio_seconds) return music_generation(args) with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.Markdown(""" # InspireMusic - Support text-to-music, music continuation, audio super-resolution, audio reconstruction tasks with high audio quality, with available sampling rates of 24kHz, 48kHz. - Support long audio generation in multiple output audio formats, i.e., wav, flac, mp3, m4a. - Open-source [InspireMusic-Base](https://modelscope.cn/models/iic/InspireMusic/summary), [InspireMusic-Base-24kHz](https://modelscope.cn/models/iic/InspireMusic-Base-24kHz/summary), [InspireMusic-1.5B](https://modelscope.cn/models/iic/InspireMusic-1.5B/summary), [InspireMusic-1.5B-24kHz](https://modelscope.cn/models/iic/InspireMusic-1.5B-24kHz/summary), [InspireMusic-1.5B-Long](https://modelscope.cn/models/iic/InspireMusic-1.5B-Long/summary) models for music generation. - Currently only support English text prompts. """) with gr.Row(equal_height=True): model_name = gr.Dropdown(["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-1.5B-24kHz", "InspireMusic-Base", "InspireMusic-Base-24kHz"], label="Select Model Name", value="InspireMusic-Base") chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"], label="Chorus Mode", value="intro") output_sample_rate = gr.Dropdown([48000, 24000], label="Output Audio Sample Rate (Hz)", value=48000) max_generate_audio_seconds = gr.Slider(10, 120, label="Generate Audio Length (s)", value=30) # with gr.Column(): # fast = gr.Checkbox(label="Fast Inference", value=False) # fade_out = gr.Checkbox(label="Apply Fade Out Effect", value=True) if output_sample_rate == 24000: fast = True else: fast = False fade_out = True with gr.Row(equal_height=True): # Textbox for custom input text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)", value="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.") audio_input = gr.Audio(label="Input Audio Prompt (For Music Continuation Task)", type="filepath") music_output = gr.Audio(label="Generated Music", type="filepath") with gr.Row(): button = gr.Button("Text to Music") button.click(run_text2music, inputs=[text_input, model_name, chorus, fast, fade_out, output_sample_rate, max_generate_audio_seconds], outputs=music_output) generate_button = gr.Button("Music Continuation") generate_button.click(run_continuation, inputs=[text_input, audio_input, model_name, chorus, fast, fade_out, output_sample_rate, max_generate_audio_seconds], outputs=music_output) with gr.Column(): default_prompt_buttons = [] for prompt in default_prompts: button = gr.Button(value=prompt) button.click(run_text2music, inputs=[text_input, model_name, chorus, fast, fade_out, output_sample_rate, max_generate_audio_seconds], outputs=music_output) default_prompt_buttons.append(button) demo.launch()