yozozaya committed
Commit 5f9d5c0 · Parent: be8283e

test as an API

Files changed (2):
  1. app.py +19 -502
  2. app_full.py +502 -0
app.py CHANGED
@@ -1,502 +1,19 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
-
- # This source code is licensed under the license found in the
- # LICENSE file in the root directory of this source tree.
-
- # Updated to account for UI changes from https://github.com/rkfg/audiocraft/blob/long/app.py
- # also released under the MIT license.
-
- import argparse
- from concurrent.futures import ProcessPoolExecutor
- import os
- from pathlib import Path
- import subprocess as sp
- from tempfile import NamedTemporaryFile
- import time
- import typing as tp
- import warnings
-
- import torch
- import gradio as gr
-
- from audiocraft.data.audio_utils import convert_audio
- from audiocraft.data.audio import audio_write
- from audiocraft.models import MusicGen, MultiBandDiffusion
-
-
- MODEL = None  # Last used model
- IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
- print(IS_BATCHED)
- MAX_BATCH_SIZE = 12
- BATCHED_DURATION = 15
- INTERRUPTING = False
- MBD = None
- # We have to wrap subprocess calls to clean up the logs when using gr.make_waveform.
- _old_call = sp.call
-
-
- def _call_nostderr(*args, **kwargs):
-     # Avoid ffmpeg vomiting on the logs.
-     kwargs['stderr'] = sp.DEVNULL
-     kwargs['stdout'] = sp.DEVNULL
-     _old_call(*args, **kwargs)
-
-
- sp.call = _call_nostderr
- # Preallocating the pool of processes.
- pool = ProcessPoolExecutor(4)
- pool.__enter__()
-
-
- def interrupt():
-     global INTERRUPTING
-     INTERRUPTING = True
-
-
- class FileCleaner:
-     def __init__(self, file_lifetime: float = 3600):
-         self.file_lifetime = file_lifetime
-         self.files = []
-
-     def add(self, path: tp.Union[str, Path]):
-         self._cleanup()
-         self.files.append((time.time(), Path(path)))
-
-     def _cleanup(self):
-         now = time.time()
-         for time_added, path in list(self.files):
-             if now - time_added > self.file_lifetime:
-                 if path.exists():
-                     path.unlink()
-                 self.files.pop(0)
-             else:
-                 break
-
-
- file_cleaner = FileCleaner()
-
-
- def make_waveform(*args, **kwargs):
-     # Further remove some warnings.
-     be = time.time()
-     with warnings.catch_warnings():
-         warnings.simplefilter('ignore')
-         out = gr.make_waveform(*args, **kwargs)
-         print("Make a video took", time.time() - be)
-         return out
-
-
- # Write a similar function to make_waveform, but for video generated from an image with an
- # aspect ratio of 16:9. Note: gradio has no make_video helper, so this function is unused
- # (its call site below stays commented out).
- def make_video(*args, **kwargs):
-     # Further remove some warnings.
-     be = time.time()
-     with warnings.catch_warnings():
-         warnings.simplefilter('ignore')
-         out = gr.make_video(*args, **kwargs)
-         print("Make a video took", time.time() - be)
-         return out
-
- # TODO: write make_video functions for other aspect ratios and use ffmpeg to combine
- # them into a single video.
-
-
- # def load_model(version='facebook/musicgen-melody'):
- def load_model(version='facebook/musicgen-small'):
-     global MODEL
-     print("Loading model", version)
-     if MODEL is None or MODEL.name != version:
-         MODEL = MusicGen.get_pretrained(version)
-
-
- def load_diffusion():
-     global MBD
-     if MBD is None:
-         print("loading MBD")
-         MBD = MultiBandDiffusion.get_mbd_musicgen()
-
-
- def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
-     MODEL.set_generation_params(duration=duration, **gen_kwargs)
-     print("new batch", len(texts), texts, [
-         None if m is None else (m[0], m[1].shape) for m in melodies])
-     be = time.time()
-     processed_melodies = []
-     target_sr = 32000
-     target_ac = 1
-     for melody in melodies:
-         if melody is None:
-             processed_melodies.append(None)
-         else:
-             sr, melody = melody[0], torch.from_numpy(
-                 melody[1]).to(MODEL.device).float().t()
-             if melody.dim() == 1:
-                 melody = melody[None]
-             melody = melody[..., :int(sr * duration)]
-             melody = convert_audio(melody, sr, target_sr, target_ac)
-             processed_melodies.append(melody)
-
-     if any(m is not None for m in processed_melodies):
-         outputs = MODEL.generate_with_chroma(
-             descriptions=texts,
-             melody_wavs=processed_melodies,
-             melody_sample_rate=target_sr,
-             progress=progress,
-             return_tokens=USE_DIFFUSION
-         )
-     else:
-         outputs = MODEL.generate(
-             texts, progress=progress, return_tokens=USE_DIFFUSION)
-     if USE_DIFFUSION:
-         outputs_diffusion = MBD.tokens_to_wav(outputs[1])
-         outputs = torch.cat([outputs[0], outputs_diffusion], dim=0)
-     outputs = outputs.detach().cpu().float()
-     # return outputs
-     pending_videos = []
-     out_wavs = []
-     for output in outputs:
-         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-             audio_write(
-                 file.name, output, MODEL.sample_rate, strategy="loudness",
-                 loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
-             pending_videos.append(pool.submit(make_waveform, file.name))
-             # pending_videos.append(pool.submit(make_video, file.name))
-             out_wavs.append(file.name)
-             file_cleaner.add(file.name)
-     out_videos = [pending_video.result() for pending_video in pending_videos]
-     for video in out_videos:
-         file_cleaner.add(video)
-     print("batch finished", len(texts), time.time() - be)
-     print("Tempfiles currently stored: ", len(file_cleaner.files))
-     # here I could upload this to YouTube Music
-     # return out_wavs
-     return out_videos, out_wavs
-
-
- def predict_batched(texts, melodies):
-     max_text_length = 512
-     texts = [text[:max_text_length] for text in texts]
-     load_model('facebook/musicgen-small')
-     res = _do_predictions(texts, melodies, BATCHED_DURATION)
-     return res
-
-
- def predict_full(model, decoder, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
-     global INTERRUPTING
-     global USE_DIFFUSION
-     INTERRUPTING = False
-     if temperature < 0:
-         raise gr.Error("Temperature must be >= 0.")
-     if topk < 0:
-         raise gr.Error("Topk must be non-negative.")
-     if topp < 0:
-         raise gr.Error("Topp must be non-negative.")
-
-     topk = int(topk)
-     if decoder == "MultiBand_Diffusion":
-         USE_DIFFUSION = True
-         load_diffusion()
-     else:
-         USE_DIFFUSION = False
-     load_model(model)
-
-     def _progress(generated, to_generate):
-         progress((min(generated, to_generate), to_generate))
-         if INTERRUPTING:
-             raise gr.Error("Interrupted.")
-     MODEL.set_custom_progress_callback(_progress)
-
-     videos, wavs = _do_predictions(
-         [text], [melody], duration, progress=True,
-         top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
-     if USE_DIFFUSION:
-         return videos[0], wavs[0], videos[1], wavs[1]
-     return videos[0], wavs[0], None, None
-
-
- def toggle_audio_src(choice):
-     if choice == "mic":
-         return gr.update(source="microphone", value=None, label="Microphone")
-     else:
-         return gr.update(source="upload", value=None, label="File")
-
-
- def toggle_diffusion(choice):
-     if choice == "MultiBand_Diffusion":
-         return [gr.update(visible=True)] * 2
-     else:
-         return [gr.update(visible=False)] * 2
-
-
- def ui_full(launch_kwargs):
-     with gr.Blocks() as interface:
-         gr.Markdown(
-             """
-             # MusicGen
-             This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
-             a simple and controllable model for music generation
-             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
-             """
-         )
-         with gr.Row():
-             with gr.Column():
-                 with gr.Row():
-                     text = gr.Text(
-                         label="Input Text", value="Chill and relaxing downtempo for the shower", interactive=True)
-                     with gr.Column():
-                         radio = gr.Radio(["file", "mic"], value="file",
-                                          label="Condition on a melody (optional) File or Mic")
-                         melody = gr.Audio(source="upload", type="numpy", label="File",
-                                           interactive=True, elem_id="melody-input")
-                 with gr.Row():
-                     submit = gr.Button("Submit")
-                     # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
-                     _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
-                 with gr.Row():
-                     model = gr.Radio(["facebook/musicgen-melody", "facebook/musicgen-medium", "facebook/musicgen-small",
-                                       "facebook/musicgen-large"],
-                                      label="Model", value="facebook/musicgen-small", interactive=True)
-                 with gr.Row():
-                     decoder = gr.Radio(["Default", "MultiBand_Diffusion"],
-                                        label="Decoder", value="Default", interactive=True)
-                 with gr.Row():
-                     duration = gr.Slider(
-                         minimum=1, maximum=120, value=20, label="Duration", interactive=True)
-                 with gr.Row():
-                     topk = gr.Number(label="Top-k", value=250, interactive=True)
-                     topp = gr.Number(label="Top-p", value=0, interactive=True)
-                     temperature = gr.Number(
-                         label="Temperature", value=1.0, interactive=True)
-                     cfg_coef = gr.Number(
-                         label="Classifier Free Guidance", value=3.0, interactive=True)
-             with gr.Column():
-                 output = gr.Video(label="Generated Music")
-                 audio_output = gr.Audio(
-                     label="Generated Music (wav)", type='filepath')
-                 diffusion_output = gr.Video(
-                     label="MultiBand Diffusion Decoder")
-                 audio_diffusion = gr.Audio(
-                     label="MultiBand Diffusion Decoder (wav)", type='filepath')
-         submit.click(toggle_diffusion, decoder, [diffusion_output, audio_diffusion], queue=False,
-                      show_progress=False, api_name="generate").then(
-             predict_full,
-             inputs=[model, decoder, text, melody, duration, topk, topp,
-                     temperature, cfg_coef],
-             outputs=[output, audio_output, diffusion_output, audio_diffusion])
-         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
-
-         gr.Examples(
-             fn=predict_full,
-             examples=[
-                 [
-                     "An 80s driving pop song with heavy drums and synth pads in the background",
-                     "./assets/bach.mp3",
-                     "facebook/musicgen-melody",
-                     "Default"
-                 ],
-                 [
-                     "90s rock song with electric guitar and heavy drums",
-                     None,
-                     "facebook/musicgen-medium",
-                     "Default"
-                 ],
-                 [
-                     "a light and cheerful EDM track, with syncopated drums, airy pads, and strong emotions",
-                     "./assets/bach.mp3",
-                     "facebook/musicgen-melody",
-                     "Default"
-                 ],
-                 [
-                     "lofi slow bpm electro chill with organic samples",
-                     None,
-                     "facebook/musicgen-medium",
-                     "Default"
-                 ],
-                 [
-                     "Punk rock with loud drums and power guitar",
-                     None,
-                     "facebook/musicgen-medium",
-                     "MultiBand_Diffusion"
-                 ],
-             ],
-             inputs=[text, melody, model, decoder],
-             outputs=[output]
-         )
-         gr.Markdown(
-             """
-             ### More details
-
-             The model will generate a short music extract based on the description you provided.
-             The model can generate up to 30 seconds of audio in one pass. It is now possible
-             to extend the generation by feeding back the end of the previous chunk of audio.
-             This can take a long time, and the model might lose consistency. The model might also
-             decide at arbitrary positions that the song ends.
-
-             **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min).
-             An overlap of 12 seconds is kept with the previously generated chunk, and 18 "new" seconds
-             are generated each time.
-
-             We present 4 model variations:
-             1. facebook/musicgen-melody -- a music generation model capable of generating music conditioned
-                 on text and melody inputs. **Note**, you can also use text only.
-             2. facebook/musicgen-small -- a 300M transformer decoder conditioned on text only.
-             3. facebook/musicgen-medium -- a 1.5B transformer decoder conditioned on text only.
-             4. facebook/musicgen-large -- a 3.3B transformer decoder conditioned on text only.
-
-             We also present two ways of decoding the audio tokens:
-             1. Use the default GAN-based compression model.
-             2. Use MultiBand Diffusion (see the MultiBand Diffusion paper).
-
-             When using `facebook/musicgen-melody`, you can optionally provide a reference audio from
-             which a broad melody will be extracted. The model will then try to follow both
-             the description and melody provided.
-
-             You can also use your own GPU or a Google Colab by following the instructions on our repo.
-             See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-             for more details.
-             """
-         )
-
-         interface.queue().launch(**launch_kwargs)
-         # interface.queue().launch(**launch_kwargs, share=True)
-
-
- def ui_batched(launch_kwargs):
-     with gr.Blocks() as demo:
-         gr.Markdown(
-             """
-             # MusicGen
-
-             This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
-             a simple and controllable model for music generation
-             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
-             <br/>
-             <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
-                 style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
-                 <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
-                     src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-             for longer sequences, more control and no queue.
-             """
-         )
-         with gr.Row():
-             with gr.Column():
-                 with gr.Row():
-                     text = gr.Text(label="Describe your music",
-                                    value="Chill and relaxing downtempo for the shower", lines=2, interactive=True)
-                     with gr.Column():
-                         radio = gr.Radio(["file", "mic"], value="file",
-                                          label="Condition on a melody (optional) File or Mic")
-                         melody = gr.Audio(source="upload", type="numpy", label="File",
-                                           interactive=True, elem_id="melody-input")
-                 with gr.Row():
-                     submit = gr.Button("Generate")
-             with gr.Column():
-                 output = gr.Video(label="Generated Music")
-                 audio_output = gr.Audio(
-                     label="Generated Music (wav)", type='filepath')
-         submit.click(predict_batched, inputs=[text, melody],
-                      outputs=[output, audio_output], batch=True, max_batch_size=MAX_BATCH_SIZE, api_name="create")
-         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
-         # gr.Examples(
-         #     fn=predict_batched,
-         #     examples=[
-         #         [
-         #             "An 80s driving pop song with heavy drums and synth pads in the background",
-         #             "./assets/bach.mp3",
-         #         ],
-         #         [
-         #             "A cheerful country song with acoustic guitars",
-         #             "./assets/bolero_ravel.mp3",
-         #         ],
-         #         [
-         #             "90s rock song with electric guitar and heavy drums",
-         #             None,
-         #         ],
-         #         [
-         #             "a light and cheerful EDM track, with syncopated drums, airy pads, and strong emotions bpm: 130",
-         #             "./assets/bach.mp3",
-         #         ],
-         #         [
-         #             "lofi slow bpm electro chill with organic samples",
-         #             None,
-         #         ],
-         #     ],
-         #     inputs=[text, melody],
-         #     outputs=[output]
-         # )
-         gr.Markdown("""
-         ### More details
-
-         The model will generate 12 seconds of audio based on the description you provided.
-         You can optionally provide a reference audio from which a broad melody will be extracted.
-         The model will then try to follow both the description and melody provided.
-         All samples are generated with the `melody` model.
-
-         You can also use your own GPU or a Google Colab by following the instructions on our repo.
-
-         See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-         for more details.
-         """)
-
-         demo.queue().launch(**launch_kwargs)
-         # demo.queue(max_size=8 * 4).launch(**launch_kwargs, share=True)
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument(
-         '--listen',
-         type=str,
-         default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
-         help='IP to listen on for connections to Gradio',
-     )
-     parser.add_argument(
-         '--username', type=str, default='', help='Username for authentication'
-     )
-     parser.add_argument(
-         '--password', type=str, default='', help='Password for authentication'
-     )
-     parser.add_argument(
-         '--server_port',
-         type=int,
-         default=0,
-         help='Port to run the server listener on',
-     )
-     parser.add_argument(
-         '--inbrowser', action='store_true', help='Open in browser'
-     )
-     parser.add_argument(
-         '--share', action='store_true', help='Share the gradio UI'
-     )
-
-     args = parser.parse_args()
-
-     launch_kwargs = {}
-     launch_kwargs['server_name'] = args.listen
-
-     if args.username and args.password:
-         launch_kwargs['auth'] = (args.username, args.password)
-     if args.server_port:
-         launch_kwargs['server_port'] = args.server_port
-     if args.inbrowser:
-         launch_kwargs['inbrowser'] = args.inbrowser
-     if args.share:
-         # launch_kwargs['share'] = args.share
-         launch_kwargs['share'] = True
-
-     global USE_DIFFUSION  # no-op at module scope; USE_DIFFUSION is simply a module global
-     # Show the interface
-     if IS_BATCHED:
-         USE_DIFFUSION = False
-         ui_batched(launch_kwargs)
-         # ui_full(launch_kwargs)
-     else:
-         # Space > https://huggingface.co/spaces/MWire/zest-2023
-         USE_DIFFUSION = False
-         # ui_full(launch_kwargs)
-         ui_batched(launch_kwargs)
 
+ import gradio
+
+ def my_inference_function(name):
+     return "Hello " + name + "!"
+
+ gradio_interface = gradio.Interface(
+     fn=my_inference_function,
+     inputs="text",
+     outputs="text",
+     examples=[
+         ["Jill"],
+         ["Sam"]
+     ],
+     title="REST API with Gradio and Huggingface Spaces",
+     description="This is a demo of how to build an AI-powered REST API with Gradio and Huggingface Spaces – for free! Based on [this article](https://www.tomsoderlund.com/ai/building-ai-powered-rest-api). See the **Use via API** link at the bottom of this page.",
+     article="Test 2023"
+ )
+
+ gradio_interface.launch()
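The new app.py exposes my_inference_function through the REST API that Gradio generates automatically for every Interface (the "Use via API" link mentioned in the description). A minimal sketch of calling it, assuming the Space is deployed; the Space id and URL below are placeholders, not values from this commit:

# Hedged sketch of calling the Space as an API once it is deployed.
# The Space id/URL are placeholders -- substitute your own.
from gradio_client import Client   # pip install gradio_client
import requests

client = Client("your-username/your-space")          # hypothetical Space id
print(client.predict("Jill", api_name="/predict"))   # -> "Hello Jill!"

# Equivalent raw HTTP call against the route Gradio 3.x serves:
resp = requests.post(
    "https://your-username-your-space.hf.space/run/predict",  # placeholder URL
    json={"data": ["Jill"]},
)
print(resp.json()["data"][0])                        # -> "Hello Jill!"

The gradio_client call and the raw POST hit the same endpoint; the JSON body simply lists the function's positional inputs in order.
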
app_full.py ADDED
@@ -0,0 +1,502 @@
+ (502 lines added: identical to the previous contents of app.py, shown above)
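
If app_full.py were ever launched as the Space app instead, its batched UI registers its click handler under api_name="create", so the same remote-call pattern would apply. A hypothetical sketch, with the Space id again a placeholder:

# Hypothetical: calling the preserved full app's batched endpoint.
from gradio_client import Client

client = Client("your-username/your-space")  # placeholder Space id
video_path, wav_path = client.predict(
    "lofi slow bpm electro chill with organic samples",  # text description
    None,                                                # optional melody audio
    api_name="/create",
)
print(video_path, wav_path)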