awacke1 commited on
Commit
feefbf0
·
1 Parent(s): 1b1197b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -23
app.py CHANGED
@@ -1,22 +1,16 @@
 
 
 
 
1
  from queue import Queue
2
  from threading import Thread
3
  from typing import Optional
4
-
5
- import numpy as np
6
- import torch
7
-
8
  from transformers import MusicgenForConditionalGeneration, MusicgenProcessor, set_seed
9
  from transformers.generation.streamers import BaseStreamer
10
 
11
- import gradio as gr
12
- import spaces
13
-
14
-
15
  model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
16
  processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
17
-
18
  title = "MusicGen Streaming"
19
-
20
  description = """
21
  Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
22
  Demo uses [MusicGen Small](https://huggingface.co/facebook/musicgen-small) in the 🤗 Transformers library. Note that the
@@ -30,18 +24,6 @@ At each decoding step, the model generates a new set of audio codes, conditional
30
  frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
31
  each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
32
  20 seconds of audio.
33
- Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
34
 - playing the audio after a specified number of decoding steps have been reached, a technique known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
35
- For example, after 250 steps we have the first 5 seconds of audio ready, and so can play this without waiting for the remaining
36
- 750 decoding steps to be complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
37
- to our output waveform on-the-fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
38
- chunks of audio, each corresponding to 250 tokens.
39
- This method of playing incremental generations reduces the latency of the MusicGen model from the total time to generate 1000 tokens,
40
- to the time taken to play the first chunk of audio (250 tokens). This can result in significant improvements to perceived latency,
41
- particularly when the chunk size is chosen to be small. In practice, the chunk size should be tuned to your device: using a
42
- smaller chunk size will mean that the first chunk is ready faster, but should not be chosen so small that the model generates slower
43
- than the time it takes to play the audio.
44
- For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L52).
45
  """
46
 
47
 
@@ -229,5 +211,4 @@ demo = gr.Interface(
229
  cache_examples=False,
230
  )
231
 
232
-
233
  demo.queue().launch()
 
1
+ import numpy as np
2
+ import torch
3
+ import gradio as gr
4
+ import spaces
5
  from queue import Queue
6
  from threading import Thread
7
  from typing import Optional
 
 
 
 
8
  from transformers import MusicgenForConditionalGeneration, MusicgenProcessor, set_seed
9
  from transformers.generation.streamers import BaseStreamer
10
 
 
 
 
 
11
  model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
12
  processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
 
13
  title = "MusicGen Streaming"
 
14
  description = """
15
  Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
16
  Demo uses [MusicGen Small](https://huggingface.co/facebook/musicgen-small) in the 🤗 Transformers library. Note that the
 
24
  frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
25
  each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
26
  20 seconds of audio.
 
 
 
 
 
 
 
 
 
 
 
 
27
  """
28
 
29
 
 
211
  cache_examples=False,
212
  )
213
 
 
214
  demo.queue().launch()