PoTaTo721 SpicyqSama007 committed on
Commit
5bea296
1 Parent(s): f9b6bed

Streaming agent (#2)

Browse files

- Streaming agent (7a0182abff42b8a3d933f516ca07d974e72152ac)
- Update app.py (5defd8e3d168c6bdb8f9a0296f0b61f22e59340e)


Co-authored-by: anya <[email protected]>

Files changed (1) hide show
  1. app.py +25 -15
app.py CHANGED
@@ -2,6 +2,8 @@ import re
2
  import gradio as gr
3
  import numpy as np
4
  import os
 
 
5
  import threading
6
  import subprocess
7
  import sys
@@ -52,6 +54,17 @@ class ChatState:
52
  def clear_fn():
53
  return [], ChatState(), None, None, None
54
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  async def process_audio_input(
57
  sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
@@ -72,11 +85,9 @@ async def process_audio_input(
72
 
73
  if isinstance(sys_audio_input, tuple):
74
  sr, sys_audio_data = sys_audio_input
75
- elif text_input:
76
  sr = 44100
77
  sys_audio_data = None
78
- else:
79
- raise gr.Error("Invalid audio format")
80
 
81
  def append_to_chat_ctx(
82
  part: ServeTextPart | ServeVQPart, role: str = "assistant"
@@ -106,22 +117,16 @@ async def process_audio_input(
106
  ):
107
  if event.type == FishE2EEventType.USER_CODES:
108
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
 
109
  elif event.type == FishE2EEventType.SPEECH_SEGMENT:
110
- result_audio += event.frame.data
111
- np_audio = np.frombuffer(result_audio, dtype=np.int16)
112
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
113
-
114
- yield state.get_history(), (44100, np_audio), None, None
115
  elif event.type == FishE2EEventType.TEXT_SEGMENT:
116
  append_to_chat_ctx(ServeTextPart(text=event.text))
117
- if result_audio:
118
- np_audio = np.frombuffer(result_audio, dtype=np.int16)
119
- yield state.get_history(), (44100, np_audio), None, None
120
- else:
121
- yield state.get_history(), None, None, None
122
 
123
- np_audio = np.frombuffer(result_audio, dtype=np.int16)
124
- yield state.get_history(), (44100, np_audio), None, None
125
 
126
 
127
  async def process_text_input(
@@ -179,7 +184,12 @@ def create_demo():
179
 
180
  text_input = gr.Textbox(label="Or type your message", type="text",value="Can you give a brief introduction of yourself?")
181
 
182
- output_audio = gr.Audio(label="Assistant's Voice", type="numpy")
 
 
 
 
 
183
 
184
  send_button = gr.Button("Send", variant="primary")
185
  clear_button = gr.Button("Clear")
 
2
  import gradio as gr
3
  import numpy as np
4
  import os
5
+ import io
6
+ import wave
7
  import threading
8
  import subprocess
9
  import sys
 
54
def clear_fn():
    """Reset the conversation UI.

    Returns an empty chat history, a brand-new ChatState, and three cleared
    component values (audio/text outputs), matching the outputs wired to the
    Clear button.
    """
    fresh_state = ChatState()
    return [], fresh_state, None, None, None
56
 
57
def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
    """Build the bytes of a RIFF/WAVE header for a raw PCM stream.

    Used to prefix streamed audio chunks so the browser's audio element can
    decode them. The header is produced by opening a ``wave`` writer on an
    in-memory buffer, setting the format fields, and closing it without
    writing any frames (the data-length fields therefore describe zero
    frames, which is fine for chunked streaming).

    Args:
        sample_rate: PCM sample rate in Hz.
        bit_depth: Bits per sample (converted to bytes for ``setsampwidth``).
        channels: Number of interleaved channels.

    Returns:
        The WAV header as ``bytes``.
    """
    with io.BytesIO() as buf:
        writer = wave.open(buf, "wb")
        try:
            writer.setnchannels(channels)
            writer.setsampwidth(bit_depth // 8)
            writer.setframerate(sample_rate)
        finally:
            # Closing the wave writer flushes the header into `buf`;
            # it does not close the underlying BytesIO.
            writer.close()
        return buf.getvalue()
68
 
69
  async def process_audio_input(
70
  sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
 
85
 
86
  if isinstance(sys_audio_input, tuple):
87
  sr, sys_audio_data = sys_audio_input
88
+ else:
89
  sr = 44100
90
  sys_audio_data = None
 
 
91
 
92
  def append_to_chat_ctx(
93
  part: ServeTextPart | ServeVQPart, role: str = "assistant"
 
117
  ):
118
  if event.type == FishE2EEventType.USER_CODES:
119
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
120
+
121
  elif event.type == FishE2EEventType.SPEECH_SEGMENT:
 
 
122
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
123
+ yield state.get_history(), wav_chunk_header() + event.frame.data, None, None
124
+
125
  elif event.type == FishE2EEventType.TEXT_SEGMENT:
126
  append_to_chat_ctx(ServeTextPart(text=event.text))
127
+ yield state.get_history(), None, None, None
 
 
 
 
128
 
129
+ yield state.get_history(), None, None, None
 
130
 
131
 
132
  async def process_text_input(
 
184
 
185
  text_input = gr.Textbox(label="Or type your message", type="text",value="Can you give a brief introduction of yourself?")
186
 
187
+ output_audio = gr.Audio(
188
+ label="Assistant's Voice",
189
+ streaming=True,
190
+ autoplay=True,
191
+ interactive=False,
192
+ )
193
 
194
  send_button = gr.Button("Send", variant="primary")
195
  clear_button = gr.Button("Clear")