seq length

- app.py +25 -0
- generator.py +4 -0

The commit adds sequence-length guards on both sides of the app: app.py rejects over-long prompts up front and caps each generated utterance at 30 seconds, while generator.py checks that the tokenized input leaves room for the requested audio frames within the model's 2048-position context.
app.py CHANGED

@@ -112,6 +112,29 @@ def infer(
     audio_prompt_speaker_a,
     audio_prompt_speaker_b,
     gen_conversation_input,
+) -> tuple[np.ndarray, int]:
+    # Estimate token limit, otherwise failure might happen after many utterances have been generated.
+    if len(gen_conversation_input.strip() + text_prompt_speaker_a.strip() + text_prompt_speaker_b.strip()) >= 2000:
+        raise gr.Error("Prompts and conversation too long.", duration=30)
+
+    try:
+        return _infer(
+            text_prompt_speaker_a,
+            text_prompt_speaker_b,
+            audio_prompt_speaker_a,
+            audio_prompt_speaker_b,
+            gen_conversation_input,
+        )
+    except ValueError as e:
+        raise gr.Error(f"Error generating audio: {e}", duration=120)
+
+
+def _infer(
+    text_prompt_speaker_a,
+    text_prompt_speaker_b,
+    audio_prompt_speaker_a,
+    audio_prompt_speaker_b,
+    gen_conversation_input,
 ) -> tuple[np.ndarray, int]:
     audio_prompt_a = prepare_prompt(text_prompt_speaker_a, 0, audio_prompt_speaker_a)
     audio_prompt_b = prepare_prompt(text_prompt_speaker_b, 1, audio_prompt_speaker_b)

@@ -128,6 +151,7 @@ def infer(
             text=line,
             speaker=speaker_id,
             context=prompt_segments + generated_segments,
+            max_audio_length_ms=30_000,
         )
         generated_segments.append(Segment(text=line, speaker=speaker_id, audio=audio_tensor))
 
@@ -215,6 +239,7 @@ with gr.Blocks() as app:
 
         gen_conversation_input = gr.TextArea(label="conversation", lines=20, value=DEFAULT_CONVERSATION)
         generate_btn = gr.Button("Generate conversation", variant="primary")
+        gr.Markdown("GPU time limited to 3 minutes, for longer usage duplicate the space.")
         audio_output = gr.Audio(label="Synthesized audio")
 
         generate_btn.click(
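The app-side change is a validate-then-delegate split: the public handler screens input length and translates worker failures into gr.Error (which Gradio renders as a dismissable toast), while a private worker keeps the synthesis logic. Below is a minimal sketch of that pattern using the same gr.Error calls as the commit; the single conversation argument and the worker body are placeholders, not the Space's real signature.

import gradio as gr

MAX_PROMPT_CHARS = 2000  # rough character proxy for the model's token budget


def infer(conversation: str) -> str:
    # Guard first: fail before any GPU time is spent on a doomed request.
    if len(conversation.strip()) >= MAX_PROMPT_CHARS:
        raise gr.Error("Prompts and conversation too long.", duration=30)
    try:
        return _infer(conversation)
    except ValueError as e:
        # Translate worker errors into a message the UI can actually display.
        raise gr.Error(f"Error generating audio: {e}", duration=120)


def _infer(conversation: str) -> str:
    # Stand-in for the real synthesis loop, which returns (np.ndarray, int).
    if not conversation:
        raise ValueError("empty conversation")
    return conversation

Keeping validation in the public handler also means generator.py can keep raising plain ValueError without knowing anything about Gradio.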
generator.py CHANGED

@@ -137,6 +137,10 @@ class Generator:
         curr_tokens_mask = prompt_tokens_mask.unsqueeze(0)
         curr_pos = torch.arange(0, prompt_tokens.size(0)).unsqueeze(0).long().to(self.device)
 
+        max_seq_len = 2048 - max_audio_frames
+        if curr_tokens.size(1) >= max_seq_len:
+            raise ValueError(f"Inputs too long, must be below max_seq_len - max_audio_frames: {max_seq_len}")
+
         for _ in range(max_audio_frames):
             sample = self._model.generate_frame(curr_tokens, curr_tokens_mask, curr_pos, temperature, topk)
             if torch.all(sample == 0):
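The generator-side check is budget arithmetic: the backbone has 2048 positions in total, and max_audio_frames of them must stay free for the frames about to be generated, so the tokenized prompt may occupy at most 2048 - max_audio_frames positions. Note that max_seq_len in the error message already has that subtraction applied. A worked sketch, assuming the 80 ms-per-frame conversion the CSM reference generator uses to turn max_audio_length_ms into frames:

CONTEXT_WINDOW = 2048                         # total positions in the backbone
max_audio_length_ms = 30_000                  # cap app.py now passes to generate()
max_audio_frames = max_audio_length_ms // 80  # assumption: 80 ms per codec frame

max_seq_len = CONTEXT_WINDOW - max_audio_frames
print(max_audio_frames, max_seq_len)  # 375 frames reserved, 1673 positions left

With 375 of 2048 positions reserved for output, roughly 1673 positions remain for the speaker prompts plus every previously generated segment, which is why app.py also trims conversation length up front instead of letting the loop fail midway.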