longform-musicgen

Running on Zero

App Files Files Community

ylacombe committed on Apr 22, 2024

Commit

b4777d5

verified ·

1 Parent(s): 2ab4d19

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -13

app.py CHANGED Viewed

@@ -508,9 +508,6 @@ class MusicgenStreamer(BaseStreamer):
         self.to_yield = 0
         self.is_longform = is_longform
-        if is_longform:
-            self.longform_stride = model.stride_longform
-            self.longform_stride_applied = True
         # variables used in the thread process
         self.audio_queue = Queue()
@@ -564,15 +561,13 @@ class MusicgenStreamer(BaseStreamer):
         if self.token_cache is None:
             self.token_cache = value
         else:
-            # if self.is_longform and not self.longform_stride_applied:
-            #     value = value[self.longform_stride:]
-            #     self.longform_stride_applied = True
             self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
         if self.token_cache.shape[-1] % self.play_steps == 0:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
-            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
-            self.to_yield += len(audio_values) - self.to_yield - self.stride
     def end(self, stream_end=False):
         """Flushes any remaining cache and appends the stop symbol."""
@@ -582,8 +577,6 @@ class MusicgenStreamer(BaseStreamer):
             audio_values = np.zeros(self.to_yield)
         stream_end = (not self.is_longform) or stream_end
-        if self.is_longform:
-            self.longform_stride_applied = False
         self.on_finalized_audio(audio_values[self.to_yield :], stream_end=stream_end)
     def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
@@ -656,13 +649,13 @@ def generate_audio(text_prompt, audio, audio_length_in_s=10.0, play_steps_in_s=2
             return_tensors="pt",
         )
-    streamer = MusicgenStreamer(model, device=device, play_steps=play_steps, is_longform=True, )
     generation_kwargs = dict(
         **inputs.to(device),
         temperature=1.2,
         streamer=streamer,
-        max_new_tokens=min(max_new_tokens, 1500),
         max_longform_generation_length=max_new_tokens,
     )
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
@@ -685,7 +678,7 @@ demo = gr.Interface(
     inputs=[
         gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
         gr.Audio(type="filepath", label="Conditioning audio. Use this for melody-guided generation."),
-        gr.Slider(35, 60, value=45, step=5, label="(Approximate) Audio length in seconds."),
         gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds.", info="Lower = shorter chunks, lower latency, more codec steps."),
         gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations."),
     ],

         self.to_yield = 0
         self.is_longform = is_longform
         # variables used in the thread process
         self.audio_queue = Queue()
         if self.token_cache is None:
             self.token_cache = value
         else:
             self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
         if self.token_cache.shape[-1] % self.play_steps == 0:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
+            if self.to_yield != len(audio_values) - self.stride:
+                self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
+                self.to_yield += len(audio_values) - self.to_yield - self.stride
     def end(self, stream_end=False):
         """Flushes any remaining cache and appends the stop symbol."""
             audio_values = np.zeros(self.to_yield)
         stream_end = (not self.is_longform) or stream_end
         self.on_finalized_audio(audio_values[self.to_yield :], stream_end=stream_end)
     def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
             return_tensors="pt",
         )
+    streamer = MusicgenStreamer(model, device=device, play_steps=play_steps, is_longform=True, stride=1)
     generation_kwargs = dict(
         **inputs.to(device),
         temperature=1.2,
         streamer=streamer,
+        max_new_tokens=min(max_new_tokens, 1503),
         max_longform_generation_length=max_new_tokens,
     )
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     inputs=[
         gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
         gr.Audio(type="filepath", label="Conditioning audio. Use this for melody-guided generation."),
+        gr.Slider(30, 60, value=45, step=5, label="(Approximate) Audio length in seconds."),
         gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds.", info="Lower = shorter chunks, lower latency, more codec steps."),
         gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations."),
     ],