yozozaya committed
Commit 469c445 · 1 Parent(s): fa7d2b1

Add application file

Files changed (3)
  1. .gitignore +62 -0
  2. app.py +463 -0
  3. requirements.txt +22 -0
.gitignore ADDED
@@ -0,0 +1,62 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # macOS dir files
+ .DS_Store
+
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ .ipynb_checkpoints
+
+ # Tests and linter
+ .pytest_cache/
+ .mypy_cache/
+ .coverage
+
+ # docs
+ /api_docs
+
+ # dotenv
+ .env
+ .envrc
+
+ # virtualenv
+ .venv
+ venv/
+ ENV/
+
+ # egs with manifest files
+ egs/*
+ !egs/example
+ # local datasets
+ dataset/*
+ !dataset/example
+
+ # personal notebooks & scripts
+ */local_scripts
+ */notes
+ .vscode/
+ /notebooks
+ /local_scripts
+ /notes
app.py ADDED
@@ -0,0 +1,463 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Updated to account for UI changes from https://github.com/rkfg/audiocraft/blob/long/app.py
+ # also released under the MIT license.
+
+ import argparse
+ from concurrent.futures import ProcessPoolExecutor
+ import os
+ from pathlib import Path
+ import subprocess as sp
+ from tempfile import NamedTemporaryFile
+ import time
+ import typing as tp
+ import warnings
+
+ import torch
+ import gradio as gr
+
+ from audiocraft.data.audio_utils import convert_audio
+ from audiocraft.data.audio import audio_write
+ from audiocraft.models import MusicGen, MultiBandDiffusion
+
+
+ MODEL = None  # Last used model
+ IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
+ print(IS_BATCHED)
+ MAX_BATCH_SIZE = 12
+ BATCHED_DURATION = 15
+ INTERRUPTING = False
+ MBD = None
+ # We wrap the subprocess call to keep ffmpeg noise out of the logs when using gr.make_waveform.
+ _old_call = sp.call
+
+
+ def _call_nostderr(*args, **kwargs):
+     # Avoid ffmpeg vomiting on the logs.
+     kwargs['stderr'] = sp.DEVNULL
+     kwargs['stdout'] = sp.DEVNULL
+     return _old_call(*args, **kwargs)
+
+
+ sp.call = _call_nostderr
+ # Preallocating the pool of processes.
+ pool = ProcessPoolExecutor(4)
+ pool.__enter__()
+
+
+ def interrupt():
+     global INTERRUPTING
+     INTERRUPTING = True
+
+
+ class FileCleaner:
+     def __init__(self, file_lifetime: float = 3600):
+         self.file_lifetime = file_lifetime
+         self.files = []
+
+     def add(self, path: tp.Union[str, Path]):
+         self._cleanup()
+         self.files.append((time.time(), Path(path)))
+
+     def _cleanup(self):
+         now = time.time()
+         for time_added, path in list(self.files):
+             if now - time_added > self.file_lifetime:
+                 if path.exists():
+                     path.unlink()
+                 self.files.pop(0)
+             else:
+                 break
+
+
+ file_cleaner = FileCleaner()
+
+
+ def make_waveform(*args, **kwargs):
+     # Further remove some warnings.
+     be = time.time()
+     with warnings.catch_warnings():
+         warnings.simplefilter('ignore')
+         out = gr.make_waveform(*args, **kwargs)
+         print("Make a video took", time.time() - be)
+         return out
+
+
+ def load_model(version='facebook/musicgen-melody'):
+     global MODEL
+     print("Loading model", version)
+     if MODEL is None or MODEL.name != version:
+         MODEL = MusicGen.get_pretrained(version)
+
+
+ def load_diffusion():
+     global MBD
+     if MBD is None:
+         print("loading MBD")
+         MBD = MultiBandDiffusion.get_mbd_musicgen()
+
+
+ def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
+     MODEL.set_generation_params(duration=duration, **gen_kwargs)
+     print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
+     be = time.time()
+     processed_melodies = []
+     target_sr = 32000
+     target_ac = 1
+     for melody in melodies:
+         if melody is None:
+             processed_melodies.append(None)
+         else:
+             sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t()
+             if melody.dim() == 1:
+                 melody = melody[None]
+             melody = melody[..., :int(sr * duration)]
+             melody = convert_audio(melody, sr, target_sr, target_ac)
+             processed_melodies.append(melody)
+
+     if any(m is not None for m in processed_melodies):
+         outputs = MODEL.generate_with_chroma(
+             descriptions=texts,
+             melody_wavs=processed_melodies,
+             melody_sample_rate=target_sr,
+             progress=progress,
+             return_tokens=USE_DIFFUSION
+         )
+     else:
+         outputs = MODEL.generate(texts, progress=progress, return_tokens=USE_DIFFUSION)
+     if USE_DIFFUSION:
+         outputs_diffusion = MBD.tokens_to_wav(outputs[1])
+         outputs = torch.cat([outputs[0], outputs_diffusion], dim=0)
+     outputs = outputs.detach().cpu().float()
+     pending_videos = []
+     out_wavs = []
+     for output in outputs:
+         with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
+             audio_write(
+                 file.name, output, MODEL.sample_rate, strategy="loudness",
+                 loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
+             pending_videos.append(pool.submit(make_waveform, file.name))
+             out_wavs.append(file.name)
+             file_cleaner.add(file.name)
+     out_videos = [pending_video.result() for pending_video in pending_videos]
+     for video in out_videos:
+         file_cleaner.add(video)
+     print("batch finished", len(texts), time.time() - be)
+     print("Tempfiles currently stored: ", len(file_cleaner.files))
+     return out_videos, out_wavs
+
+
+ def predict_batched(texts, melodies):
+     max_text_length = 512
+     texts = [text[:max_text_length] for text in texts]
+     load_model('facebook/musicgen-melody')
+     res = _do_predictions(texts, melodies, BATCHED_DURATION)
+     return res
+
+
+ def predict_full(model, decoder, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
+     global INTERRUPTING
+     global USE_DIFFUSION
+     INTERRUPTING = False
+     if temperature < 0:
+         raise gr.Error("Temperature must be >= 0.")
+     if topk < 0:
+         raise gr.Error("Topk must be non-negative.")
+     if topp < 0:
+         raise gr.Error("Topp must be non-negative.")
+
+     topk = int(topk)
+     if decoder == "MultiBand_Diffusion":
+         USE_DIFFUSION = True
+         load_diffusion()
+     else:
+         USE_DIFFUSION = False
+     load_model(model)
+
+     def _progress(generated, to_generate):
+         progress((min(generated, to_generate), to_generate))
+         if INTERRUPTING:
+             raise gr.Error("Interrupted.")
+     MODEL.set_custom_progress_callback(_progress)
+
+     videos, wavs = _do_predictions(
+         [text], [melody], duration, progress=True,
+         top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
+     if USE_DIFFUSION:
+         return videos[0], wavs[0], videos[1], wavs[1]
+     return videos[0], wavs[0], None, None
+
+
+ def toggle_audio_src(choice):
+     if choice == "mic":
+         return gr.update(source="microphone", value=None, label="Microphone")
+     else:
+         return gr.update(source="upload", value=None, label="File")
+
+
+ def toggle_diffusion(choice):
+     if choice == "MultiBand_Diffusion":
+         return [gr.update(visible=True)] * 2
+     else:
+         return [gr.update(visible=False)] * 2
+
+
+ def ui_full(launch_kwargs):
+     with gr.Blocks() as interface:
+         gr.Markdown(
+             """
+             # MusicGen
+             This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
+             a simple and controllable model for music generation
+             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
+             """
+         )
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     text = gr.Text(label="Input Text", interactive=True)
+                     with gr.Column():
+                         radio = gr.Radio(["file", "mic"], value="file",
+                                          label="Condition on a melody (optional) File or Mic")
+                         melody = gr.Audio(source="upload", type="numpy", label="File",
+                                           interactive=True, elem_id="melody-input")
+                 with gr.Row():
+                     submit = gr.Button("Submit")
+                     # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
+                     _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
+                 with gr.Row():
+                     model = gr.Radio(["facebook/musicgen-melody", "facebook/musicgen-medium", "facebook/musicgen-small",
+                                       "facebook/musicgen-large"],
+                                      label="Model", value="facebook/musicgen-melody", interactive=True)
+                 with gr.Row():
+                     decoder = gr.Radio(["Default", "MultiBand_Diffusion"],
+                                        label="Decoder", value="Default", interactive=True)
+                 with gr.Row():
+                     duration = gr.Slider(minimum=1, maximum=120, value=10, label="Duration", interactive=True)
+                 with gr.Row():
+                     topk = gr.Number(label="Top-k", value=250, interactive=True)
+                     topp = gr.Number(label="Top-p", value=0, interactive=True)
+                     temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
+                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
+             with gr.Column():
+                 output = gr.Video(label="Generated Music")
+                 audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
+                 diffusion_output = gr.Video(label="MultiBand Diffusion Decoder")
+                 audio_diffusion = gr.Audio(label="MultiBand Diffusion Decoder (wav)", type='filepath')
+         submit.click(toggle_diffusion, decoder, [diffusion_output, audio_diffusion], queue=False,
+                      show_progress=False).then(predict_full, inputs=[model, decoder, text, melody, duration, topk, topp,
+                                                                      temperature, cfg_coef],
+                                                outputs=[output, audio_output, diffusion_output, audio_diffusion])
+         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
+
+         gr.Examples(
+             fn=predict_full,
+             examples=[
+                 [
+                     "An 80s driving pop song with heavy drums and synth pads in the background",
+                     "./assets/bach.mp3",
+                     "facebook/musicgen-melody",
+                     "Default"
+                 ],
+                 [
+                     "A cheerful country song with acoustic guitars",
+                     "./assets/bolero_ravel.mp3",
+                     "facebook/musicgen-melody",
+                     "Default"
+                 ],
+                 [
+                     "90s rock song with electric guitar and heavy drums",
+                     None,
+                     "facebook/musicgen-medium",
+                     "Default"
+                 ],
+                 [
+                     "a light and cheerful EDM track, with syncopated drums, airy pads, and strong emotions",
+                     "./assets/bach.mp3",
+                     "facebook/musicgen-melody",
+                     "Default"
+                 ],
+                 [
+                     "lofi slow bpm electro chill with organic samples",
+                     None,
+                     "facebook/musicgen-medium",
+                     "Default"
+                 ],
+                 [
+                     "Punk rock with loud drums and power guitar",
+                     None,
+                     "facebook/musicgen-medium",
+                     "MultiBand_Diffusion"
+                 ],
+             ],
+             inputs=[text, melody, model, decoder],
+             outputs=[output]
+         )
+         gr.Markdown(
+             """
+             ### More details
+
+             The model will generate a short music extract based on the description you provided.
+             The model can generate up to 30 seconds of audio in one pass. It is now possible
+             to extend the generation by feeding back the end of the previous chunk of audio.
+             This can take a long time, and the model might lose consistency. The model might also
+             decide at arbitrary positions that the song ends.
+
+             **WARNING:** Choosing long durations will take a long time to generate (2min might take ~10min).
+             An overlap of 12 seconds is kept with the previously generated chunk, and 18 "new" seconds
+             are generated each time.
+
+             We present 4 model variations:
+             1. facebook/musicgen-melody -- a music generation model capable of generating music conditioned
+                 on text and melody inputs. **Note**, you can also use text only.
+             2. facebook/musicgen-small -- a 300M transformer decoder conditioned on text only.
+             3. facebook/musicgen-medium -- a 1.5B transformer decoder conditioned on text only.
+             4. facebook/musicgen-large -- a 3.3B transformer decoder conditioned on text only.
+
+             We also present two ways of decoding the audio tokens:
+             1. Use the default GAN-based compression model
+             2. Use MultiBand Diffusion (paper link)
+
+             When using `facebook/musicgen-melody`, you can optionally provide a reference audio from
+             which a broad melody will be extracted. The model will then try to follow both
+             the description and melody provided.
+
+             You can also use your own GPU or a Google Colab by following the instructions on our repo.
+             See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+             for more details.
+             """
+         )
+
+         interface.queue().launch(**launch_kwargs)
+
+
+ def ui_batched(launch_kwargs):
+     with gr.Blocks() as demo:
+         gr.Markdown(
+             """
+             # MusicGen
+
+             This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft),
+             a simple and controllable model for music generation
+             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
+             <br/>
+             <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
+                 style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+                 <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+                     src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+             for longer sequences, more control and no queue.
+             """
+         )
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     text = gr.Text(label="Describe your music", lines=2, interactive=True)
+                     with gr.Column():
+                         radio = gr.Radio(["file", "mic"], value="file",
+                                          label="Condition on a melody (optional) File or Mic")
+                         melody = gr.Audio(source="upload", type="numpy", label="File",
+                                           interactive=True, elem_id="melody-input")
+                 with gr.Row():
+                     submit = gr.Button("Generate")
+             with gr.Column():
+                 output = gr.Video(label="Generated Music")
+                 audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')
+         submit.click(predict_batched, inputs=[text, melody],
+                      outputs=[output, audio_output], batch=True, max_batch_size=MAX_BATCH_SIZE)
+         radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
+         gr.Examples(
+             fn=predict_batched,
+             examples=[
+                 [
+                     "An 80s driving pop song with heavy drums and synth pads in the background",
+                     "./assets/bach.mp3",
+                 ],
+                 [
+                     "A cheerful country song with acoustic guitars",
+                     "./assets/bolero_ravel.mp3",
+                 ],
+                 [
+                     "90s rock song with electric guitar and heavy drums",
+                     None,
+                 ],
+                 [
+                     "a light and cheerful EDM track, with syncopated drums, airy pads, and strong emotions bpm: 130",
+                     "./assets/bach.mp3",
+                 ],
+                 [
+                     "lofi slow bpm electro chill with organic samples",
+                     None,
+                 ],
+             ],
+             inputs=[text, melody],
+             outputs=[output]
+         )
+         gr.Markdown("""
+         ### More details
+
+         The model will generate 12 seconds of audio based on the description you provided.
+         You can optionally provide a reference audio from which a broad melody will be extracted.
+         The model will then try to follow both the description and melody provided.
+         All samples are generated with the `melody` model.
+
+         You can also use your own GPU or a Google Colab by following the instructions on our repo.
+
+         See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+         for more details.
+         """)
+
+         demo.queue(max_size=8 * 4).launch(**launch_kwargs)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         '--listen',
+         type=str,
+         default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
+         help='IP to listen on for connections to Gradio',
+     )
+     parser.add_argument(
+         '--username', type=str, default='', help='Username for authentication'
+     )
+     parser.add_argument(
+         '--password', type=str, default='', help='Password for authentication'
+     )
+     parser.add_argument(
+         '--server_port',
+         type=int,
+         default=0,
+         help='Port to run the server listener on',
+     )
+     parser.add_argument(
+         '--inbrowser', action='store_true', help='Open in browser'
+     )
+     parser.add_argument(
+         '--share', action='store_true', help='Share the gradio UI'
+     )
+
+     args = parser.parse_args()
+
+     launch_kwargs = {}
+     launch_kwargs['server_name'] = args.listen
+
+     if args.username and args.password:
+         launch_kwargs['auth'] = (args.username, args.password)
+     if args.server_port:
+         launch_kwargs['server_port'] = args.server_port
+     if args.inbrowser:
+         launch_kwargs['inbrowser'] = args.inbrowser
+     if args.share:
+         launch_kwargs['share'] = args.share
+
+     # Show the interface
+     if IS_BATCHED:
+         # The batched demo always uses the default decoder.
+         USE_DIFFUSION = False
+         ui_batched(launch_kwargs)
+     else:
+         ui_full(launch_kwargs)
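
The __main__ block above maps each CLI flag onto a Gradio launch kwarg, so the app can be exercised locally before deploying it as a Space. A minimal invocation sketch (host and port values are illustrative, not taken from the commit; --listen, --server_port and --share are the flags defined above):

    python app.py --listen 127.0.0.1 --server_port 7860
    python app.py --share    # request a temporary public Gradio link instead
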
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ # Please make sure you already have a CUDA-enabled PyTorch install!
+ av
+ einops
+ flashy>=0.0.1
+ hydra-core>=1.1
+ hydra_colorlog
+ julius
+ num2words
+ numpy
+ sentencepiece
+ spacy==3.5.2
+ torch>=2.0.0
+ torchaudio>=2.0.0
+ huggingface_hub
+ tqdm
+ transformers>=4.31.0  # Encodec support lives there.
+ xformers
+ demucs
+ librosa
+ gradio
+ torchmetrics
+ encodec
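
Assuming a CUDA-enabled PyTorch install is already present, as the first comment in this file requests, the remaining dependencies would typically be installed with standard pip usage (the command below is illustrative, not part of the commit):

    pip install -r requirements.txt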