Update app.py
Browse files
app.py
CHANGED
@@ -1,22 +1,16 @@
|
|
|
|
|
|
|
|
|
|
1 |
from queue import Queue
|
2 |
from threading import Thread
|
3 |
from typing import Optional
|
4 |
-
|
5 |
-
import numpy as np
|
6 |
-
import torch
|
7 |
-
|
8 |
from transformers import MusicgenForConditionalGeneration, MusicgenProcessor, set_seed
|
9 |
from transformers.generation.streamers import BaseStreamer
|
10 |
|
11 |
-
import gradio as gr
|
12 |
-
import spaces
|
13 |
-
|
14 |
-
|
15 |
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
|
16 |
processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
|
17 |
-
|
18 |
title = "MusicGen Streaming"
|
19 |
-
|
20 |
description = """
|
21 |
Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
|
22 |
Demo uses [MusicGen Small](https://huggingface.co/facebook/musicgen-small) in the 🤗 Transformers library. Note that the
|
@@ -30,18 +24,6 @@ At each decoding step, the model generates a new set of audio codes, conditional
|
|
30 |
frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
|
31 |
each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
|
32 |
20 seconds of audio.
|
33 |
-
Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
|
34 |
-
playing the audio after a specified number of decoding steps has been reached, a technique known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
|
35 |
-
For example, after 250 steps we have the first 5 seconds of audio ready, and so can play this without waiting for the remaining
|
36 |
-
750 decoding steps to be complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
|
37 |
-
to our output waveform on-the-fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
|
38 |
-
chunks of audio, each corresponding to 250 tokens.
|
39 |
-
This method of playing incremental generations reduces the latency of the MusicGen model from the total time to generate 1000 tokens,
|
40 |
-
to the time taken to play the first chunk of audio (250 tokens). This can result in significant improvements to perceived latency,
|
41 |
-
particularly when the chunk size is chosen to be small. In practice, the chunk size should be tuned to your device: using a
|
42 |
-
smaller chunk size will mean that the first chunk is ready faster, but should not be chosen so small that the model generates slower
|
43 |
-
than the time it takes to play the audio.
|
44 |
-
For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L52).
|
45 |
"""
|
46 |
|
47 |
|
@@ -229,5 +211,4 @@ demo = gr.Interface(
|
|
229 |
cache_examples=False,
|
230 |
)
|
231 |
|
232 |
-
|
233 |
demo.queue().launch()
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import gradio as gr
|
4 |
+
import spaces
|
5 |
from queue import Queue
|
6 |
from threading import Thread
|
7 |
from typing import Optional
|
|
|
|
|
|
|
|
|
8 |
from transformers import MusicgenForConditionalGeneration, MusicgenProcessor, set_seed
|
9 |
from transformers.generation.streamers import BaseStreamer
|
10 |
|
|
|
|
|
|
|
|
|
11 |
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
|
12 |
processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
|
|
|
13 |
title = "MusicGen Streaming"
|
|
|
14 |
description = """
|
15 |
Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
|
16 |
Demo uses [MusicGen Small](https://huggingface.co/facebook/musicgen-small) in the 🤗 Transformers library. Note that the
|
|
|
24 |
frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
|
25 |
each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
|
26 |
20 seconds of audio.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
"""
|
28 |
|
29 |
|
|
|
211 |
cache_examples=False,
|
212 |
)
|
213 |
|
|
|
214 |
demo.queue().launch()
|