PoTaTo721 SpicyqSama007 committed on
Commit
5bea296
1 Parent(s): f9b6bed

Streaming agent (#2)

Browse files

- Streaming agent (7a0182abff42b8a3d933f516ca07d974e72152ac)
- Update app.py (5defd8e3d168c6bdb8f9a0296f0b61f22e59340e)


Co-authored-by: anya <[email protected]>

Files changed (1) hide show
  1. app.py +25 -15
app.py CHANGED
@@ -2,6 +2,8 @@ import re
2
  import gradio as gr
3
  import numpy as np
4
  import os
 
 
5
  import threading
6
  import subprocess
7
  import sys
@@ -52,6 +54,17 @@ class ChatState:
52
  def clear_fn():
53
  return [], ChatState(), None, None, None
54
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  async def process_audio_input(
57
  sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
@@ -72,11 +85,9 @@ async def process_audio_input(
72
 
73
  if isinstance(sys_audio_input, tuple):
74
  sr, sys_audio_data = sys_audio_input
75
- elif text_input:
76
  sr = 44100
77
  sys_audio_data = None
78
- else:
79
- raise gr.Error("Invalid audio format")
80
 
81
  def append_to_chat_ctx(
82
  part: ServeTextPart | ServeVQPart, role: str = "assistant"
@@ -106,22 +117,16 @@ async def process_audio_input(
106
  ):
107
  if event.type == FishE2EEventType.USER_CODES:
108
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
 
109
  elif event.type == FishE2EEventType.SPEECH_SEGMENT:
110
- result_audio += event.frame.data
111
- np_audio = np.frombuffer(result_audio, dtype=np.int16)
112
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
113
-
114
- yield state.get_history(), (44100, np_audio), None, None
115
  elif event.type == FishE2EEventType.TEXT_SEGMENT:
116
  append_to_chat_ctx(ServeTextPart(text=event.text))
117
- if result_audio:
118
- np_audio = np.frombuffer(result_audio, dtype=np.int16)
119
- yield state.get_history(), (44100, np_audio), None, None
120
- else:
121
- yield state.get_history(), None, None, None
122
 
123
- np_audio = np.frombuffer(result_audio, dtype=np.int16)
124
- yield state.get_history(), (44100, np_audio), None, None
125
 
126
 
127
  async def process_text_input(
@@ -179,7 +184,12 @@ def create_demo():
179
 
180
  text_input = gr.Textbox(label="Or type your message", type="text",value="Can you give a brief introduction of yourself?")
181
 
182
- output_audio = gr.Audio(label="Assistant's Voice", type="numpy")
 
 
 
 
 
183
 
184
  send_button = gr.Button("Send", variant="primary")
185
  clear_button = gr.Button("Clear")
 
2
  import gradio as gr
3
  import numpy as np
4
  import os
5
+ import io
6
+ import wave
7
  import threading
8
  import subprocess
9
  import sys
 
54
def clear_fn():
    """Reset the conversation UI.

    Returns an empty chat history, a brand-new ChatState, and three cleared
    component values (audio/text outputs), matching the outputs wired to the
    Clear button.
    """
    fresh_state = ChatState()
    return [], fresh_state, None, None, None
56
 
57
def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
    """Build the bytes of a RIFF/WAVE header for a raw PCM stream.

    Used to prefix streamed audio chunks so the browser's audio element can
    decode them. The header is produced by opening a ``wave`` writer on an
    in-memory buffer, setting the format fields, and closing it without
    writing any frames (the data-length fields therefore describe zero
    frames, which is fine for chunked streaming).

    Args:
        sample_rate: PCM sample rate in Hz.
        bit_depth: Bits per sample (converted to bytes for ``setsampwidth``).
        channels: Number of interleaved channels.

    Returns:
        The WAV header as ``bytes``.
    """
    with io.BytesIO() as buf:
        writer = wave.open(buf, "wb")
        try:
            writer.setnchannels(channels)
            writer.setsampwidth(bit_depth // 8)
            writer.setframerate(sample_rate)
        finally:
            # Closing the wave writer flushes the header into `buf`;
            # it does not close the underlying BytesIO.
            writer.close()
        return buf.getvalue()
68
 
69
  async def process_audio_input(
70
  sys_audio_input, sys_text_input, audio_input, state: ChatState, text_input: str
 
85
 
86
  if isinstance(sys_audio_input, tuple):
87
  sr, sys_audio_data = sys_audio_input
88
+ else:
89
  sr = 44100
90
  sys_audio_data = None
 
 
91
 
92
  def append_to_chat_ctx(
93
  part: ServeTextPart | ServeVQPart, role: str = "assistant"
 
117
  ):
118
  if event.type == FishE2EEventType.USER_CODES:
119
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes), role="user")
120
+
121
  elif event.type == FishE2EEventType.SPEECH_SEGMENT:
 
 
122
  append_to_chat_ctx(ServeVQPart(codes=event.vq_codes))
123
+ yield state.get_history(), wav_chunk_header() + event.frame.data, None, None
124
+
125
  elif event.type == FishE2EEventType.TEXT_SEGMENT:
126
  append_to_chat_ctx(ServeTextPart(text=event.text))
127
+ yield state.get_history(), None, None, None
 
 
 
 
128
 
129
+ yield state.get_history(), None, None, None
 
130
 
131
 
132
  async def process_text_input(
 
184
 
185
  text_input = gr.Textbox(label="Or type your message", type="text",value="Can you give a brief introduction of yourself?")
186
 
187
+ output_audio = gr.Audio(
188
+ label="Assistant's Voice",
189
+ streaming=True,
190
+ autoplay=True,
191
+ interactive=False,
192
+ )
193
 
194
  send_button = gr.Button("Send", variant="primary")
195
  clear_button = gr.Button("Clear")