Spaces · Running on Zero

chong.zhang committed · Commit b93c24a · Parent: 96fe5d9

update

Browse files:
- app.py: +96 -254
- inspiremusic/cli/inference.py: +4 -0
app.py CHANGED

@@ -1,259 +1,101 @@
-
-
-import io
-import numpy as np
-import torchaudio
-
-import torch
-import soundfile as sf
+import os
 import gradio as gr
-import spaces
 from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
-import os
-import sys
-
-
-def get_args():
-    parser = argparse.ArgumentParser(
-        description='Run inference with your model')
-    parser.add_argument('-m', '--model_name', default="InspireMusic-1.5B-Long",
-                        help='Model name')
-
-    parser.add_argument('-d', '--model_dir',
-                        help='Model folder path')
-
-    parser.add_argument('-t', '--text',
-                        default="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.",
-                        help='Prompt text')
-
-    parser.add_argument('-a', '--audio_prompt', default=None,
-                        help='Prompt audio')
-
-    parser.add_argument('-c', '--chorus', default="intro",
-                        help='Chorus tag generation mode (e.g., random, verse, chorus, intro, outro)')
-
-    parser.add_argument('--fast', type=bool, default=False,
-                        help='Enable fast inference mode (without flow matching)')
-
-    parser.add_argument('-g', '--gpu', type=int, default=0,
-                        help='GPU ID for this rank, -1 for CPU')
-
-    parser.add_argument('--task', default='text-to-music',
-                        choices=['text-to-music', 'continuation', 'reconstruct', 'super_resolution'],
-                        help='Inference task type: text-to-music, continuation, reconstruct, super_resolution')
-
-    parser.add_argument('-r', '--result_dir', default="exp/inspiremusic",
-                        help='Directory to save generated audio')
-
-    parser.add_argument('-o', '--output_fn', default="output_audio",
-                        help='Output file name')
-
-    parser.add_argument('-f', '--format', type=str, default="wav",
-                        choices=["wav", "mp3", "m4a", "flac"],
-                        help='Format of output audio')
-
-    parser.add_argument('--sample_rate', type=int, default=24000,
-                        help='Sampling rate of input audio')
-
-    parser.add_argument('--output_sample_rate', type=int, default=48000,
-                        choices=[24000, 48000],
-                        help='Sampling rate of generated output audio')
-
-    parser.add_argument('-s', '--time_start', type=float, default=0.0,
-                        help='Start time in seconds')
-
-    parser.add_argument('-e', '--time_end', type=float, default=30.0,
-                        help='End time in seconds')
-
-    parser.add_argument('--max_audio_prompt_length', type=float, default=5.0,
-                        help='Maximum audio prompt length in seconds')
-
-    parser.add_argument('--min_generate_audio_seconds', type=float,
-                        default=10.0,
-                        help='Minimum generated audio length in seconds')
-
-    parser.add_argument('--max_generate_audio_seconds', type=float,
-                        default=30.0,
-                        help='Maximum generated audio length in seconds')
-
-    parser.add_argument('--fp16', type=bool, default=True,
-                        help='Inference with fp16 model')
 
-    … (old lines 82-176 were not preserved in this page capture; only a stray empty string literal "" on old line 159 survives)
-    args = get_args()
-    args.task = task
-    args.text = text if text
-    args.audio_prompt = audio if audio
-    generate_audio_path = InspireMusic(args)
-    return generate_audio_path
-
-demo = gr.Blocks()
-
-t2m_demo = gr.Interface(
-    fn=music_generation,
-    inputs = [
-        gr.Dropdown(["Text-To-Music"], value="text-to-music", multiselect=False, info="Choose a task."),
-        gr.Text(label="Input Text"),
-    ],
-    outputs = [
-        gr.Audio(label="Generated Music", type="generated audio filepath"),
-    ],
-    title = "<a href='https://github.com/FunAudioLLM/InspireMusic' target='_blank'>InspireMusic<a/>: A Unified Framework for Music, Song, Audio Generation.",
-    description = ("InspireMusic ([Github Repo](https://github.com/FunAudioLLM/InspireMusic)) is a fundamental AIGC toolkit and models designed for music, song, and audio generation using PyTorch."
-                   "To try it, simply type text to generation music, or click one of the examples. "),
-    article = ("<p style='text-align: center'><a href='' target='_blank'>InspireMusic</a> </p>"
-               "<p style='text-align: center'><a href='https://openreview.net/forum?id=yBlVlS2Fd9' target='_blank'>WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling</a> </p>"),
-    examples = [
-        ["example/inspiremusic/inspiremusic_01.wav", "24000 Hz"],
-        ["example/ras/chorus/chorus_01.wav", "48000 Hz"],
-    ],
-    cache_examples = True,
-)
-
-con_demo = gr.Interface(
-    fn=music_generation,
-    inputs = [
-        gr.Dropdown(["Music Continuation"], value="continuation", multiselect=False, info="Choose a task."),
-        gr.Text(label="Input Text"),
-        gr.Audio(label="Input Audio Prompt", type="audio prompt filepath"),
-    ],
-    outputs = [
-        gr.Audio(label="Generated Music", type="generated audio filepath"),
-    ],
-    title = "<a href='https://github.com/FunAudioLLM/InspireMusic' target='_blank'>InspireMusic<a/>: A Unified Framework for Music, Song, Audio Generation.",
-    description = ("InspireMusic ([Github Repo](https://github.com/FunAudioLLM/InspireMusic)) is a fundamental AIGC toolkit and models designed for music, song, and audio generation using PyTorch."
-                   "To try it, simply type text to generation music, or click one of the examples. "),
-    article = ("<p style='text-align: center'><a href='' target='_blank'>InspireMusic</a> </p>"
-               "<p style='text-align: center'><a href='https://openreview.net/forum?id=yBlVlS2Fd9' target='_blank'>WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling</a> </p>"),
-    examples = [
-        ["example/inspiremusic/inspiremusic_01.wav", "24000 Hz"],
-        ["example/ras/chorus/chorus_01.wav", "48000 Hz"],
-    ],
-    cache_examples = True,
-)
-
-con_demo = gr.Interface(
-    … (old lines 229-249: a second, verbatim-identical definition of con_demo)
-)
-
-with demo:
-    gr.TabbedInterface([t2m_demo, con_demo,],
-                       ["Task 1: Text-to-Music",
-                        "Task 2: Music Continuation"])
-    # gr.TabbedInterface([t2m_demo, con_demo, fast_demo], ["Task 1: Text-to-Music", "Task 2: Music Continuation", "Task 3: Without Flow Matching"])
+def get_args(
+        task, text="", audio=None, model_name="InspireMusic-1.5B-Long",
+        chorus="intro", fast=False, fade_out=True, trim=False,
+        output_sample_rate=48000, max_generate_audio_seconds=30.0):
+    # This function constructs the arguments required for InspireMusic
+    args = {
+        "task" : task,
+        "text" : text,
+        "audio_prompt" : audio,
+        "model_name" : model_name,
+        "chorus" : chorus,
+        "fast" : fast,
+        "fade_out" : fade_out,
+        "trim" : trim,
+        "output_sample_rate" : output_sample_rate,
+        "max_generate_audio_seconds": max_generate_audio_seconds,
+        "model_dir" : os.path.join("pretrained_models",
+                                   model_name),
+        "result_dir" : "exp/inspiremusic",
+        "output_fn" : "output_audio",
+        "format" : "wav",
+    }
+    return args
+
+
+def music_generation(args):
+    set_env_variables()
+    model = InspireMusicUnified(
+        model_name=args["model_name"],
+        model_dir=args["model_dir"],
+        fast=args["fast"],
+        fade_out_mode=args["fade_out"],
+        trim=args["trim"],
+        output_sample_rate=args["output_sample_rate"],
+        max_generate_audio_seconds=args["max_generate_audio_seconds"]
+    )
+    output_path = model.inference(
+        task=args["task"],
+        text=args["text"],
+        audio_prompt=args["audio_prompt"],
+        chorus=args["chorus"],
+        output_fn=args["output_fn"],
+        output_format=args["format"],
+        time_start=0.0, time_end=30.0
+    )
+    return output_path
+
+
+def run_inspiremusic(task, text, audio, model_name, chorus, fast, fade_out,
+                     trim, output_sample_rate, max_generate_audio_seconds):
+    args = get_args(
+        task=task, text=text, audio=audio,
+        model_name=model_name, chorus=chorus, fast=fast,
+        fade_out=fade_out, trim=trim, output_sample_rate=output_sample_rate,
+        max_generate_audio_seconds=max_generate_audio_seconds)
+    return music_generation(args)
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("""
+    # InspireMusic:
+    Generate music using InspireMusic with various tasks such as "Text-to-Music" or "Music Continuation".
+    """)
+
+    with gr.Row():
+        task = gr.Radio(["text-to-music", "continuation"], label="Select Task",
+                        value="text-to-music")
+        model_name = gr.Dropdown(["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-1.5B-24kHz", "InspireMusic-Base", "InspireMusic-Base-24kHz"], label="Model Name", value="InspireMusic-1.5B-Long")
+
+    text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)")
+    audio_input = gr.Audio(label="Input Audio (For Music Continuation Task)",
+                           source="upload", type="filepath")
+
+    with gr.Column():
+        chorus = gr.Dropdown(["verse", "chorus", "intro", "outro"],
+                             label="Chorus Mode", value="intro")
+        fast = gr.Checkbox(label="Fast Inference", value=False)
+        fade_out = gr.Checkbox(label="Apply Fade Out", value=True)
+        trim = gr.Checkbox(label="Trim Silence", value=False)
+        output_sample_rate = gr.Dropdown([24000, 48000],
+                                         label="Output Sample Rate",
+                                         value=48000)
+        max_generate_audio_seconds = gr.Slider(10, 300,
+                                               label="Max Generated Audio Length (Seconds)",
+                                               value=30)
+
+    music_output = gr.Audio(label="Generated Music Result", type="filepath")
+    generate_button = gr.Button("Generate Music")
+
+    generate_button.click(run_inspiremusic,
+                          inputs=[task, text_input, audio_input, model_name,
+                                  chorus, fast, fade_out, trim,
+                                  output_sample_rate,
+                                  max_generate_audio_seconds],
+                          outputs=music_output)
 
 demo.launch()
-
-
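For reference, the rewritten app.py funnels every UI action through a single path: `run_inspiremusic` builds an argument dict with `get_args`, and `music_generation` instantiates `InspireMusicUnified` and calls its `inference` method. The sketch below exercises that same path headlessly, with no Gradio in the loop. It is a minimal sketch under two assumptions: model weights have already been downloaded to `pretrained_models/<model_name>/` (the location `get_args` resolves), and the constructor and `inference()` accept exactly the keyword arguments the committed `music_generation` wrapper passes (the class may accept more; see `inspiremusic/cli/inference.py`).

```python
import os

from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables

set_env_variables()

model_name = "InspireMusic-1.5B-Long"  # any entry from the Model Name dropdown
model = InspireMusicUnified(
    model_name=model_name,
    # get_args() resolves the model directory the same way
    model_dir=os.path.join("pretrained_models", model_name),
    fast=False,         # the old CLI flag described fast mode as "without flow matching"
    fade_out_mode=True,
    trim=False,
    output_sample_rate=48000,
    max_generate_audio_seconds=30.0,
)

# Same keywords music_generation() passes, with the UI's defaults filled in.
path = model.inference(
    task="text-to-music",
    text="Soothing instrumental jazz with a touch of Bossa Nova.",
    audio_prompt=None,  # a filepath here plus task="continuation" continues a clip
    chorus="intro",
    output_fn="output_audio",
    output_format="wav",
    time_start=0.0,
    time_end=30.0,
)
print(path)  # path to the generated audio; exact location depends on InspireMusicUnified defaults
```

One compatibility note: `gr.Audio(..., source="upload", type="filepath")` is the Gradio 3.x signature; Gradio 4.x replaced `source` with a `sources` list, so this UI presumably runs against a pinned 3.x release.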
inspiremusic/cli/inference.py CHANGED

@@ -186,6 +186,10 @@ class InspireMusicUnified:
 
         else:
             logging.error(f"Generated audio length is shorter than minimum required audio length.")
+        if music_fn:
+            return music_fn
+        else:
+            return None
 
 def get_args():
     parser = argparse.ArgumentParser(description='Run inference with your model')