Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -30,7 +30,7 @@ import gradio as gr
|
|
30 |
import librosa
|
31 |
import numpy as np
|
32 |
import requests
|
33 |
-
from gradio_webrtc import
|
34 |
from huggingface_hub import snapshot_download
|
35 |
from pydub import AudioSegment
|
36 |
from twilio.rest import Client
|
@@ -67,102 +67,13 @@ if account_sid and auth_token:
|
|
67 |
else:
|
68 |
rtc_configuration = None
|
69 |
|
70 |
-
# recording parameters
|
71 |
-
IN_CHANNELS = 1
|
72 |
-
IN_RATE = 24000
|
73 |
-
IN_CHUNK = 1024
|
74 |
-
IN_SAMPLE_WIDTH = 2
|
75 |
-
VAD_STRIDE = 0.5
|
76 |
-
|
77 |
-
# playing parameters
|
78 |
OUT_CHANNELS = 1
|
79 |
OUT_RATE = 24000
|
80 |
OUT_SAMPLE_WIDTH = 2
|
81 |
OUT_CHUNK = 20 * 4096
|
82 |
|
83 |
|
84 |
-
def
|
85 |
-
_st = time.time()
|
86 |
-
try:
|
87 |
-
audio = ori_audio
|
88 |
-
audio = audio.astype(np.float32) / 32768.0
|
89 |
-
sampling_rate = 16000
|
90 |
-
if sr != sampling_rate:
|
91 |
-
audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
|
92 |
-
|
93 |
-
vad_parameters = {}
|
94 |
-
vad_parameters = VadOptions(**vad_parameters)
|
95 |
-
speech_chunks = get_speech_timestamps(audio, vad_parameters)
|
96 |
-
audio = collect_chunks(audio, speech_chunks)
|
97 |
-
duration_after_vad = audio.shape[0] / sampling_rate
|
98 |
-
|
99 |
-
if sr != sampling_rate:
|
100 |
-
# resample to original sampling rate
|
101 |
-
vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
|
102 |
-
else:
|
103 |
-
vad_audio = audio
|
104 |
-
vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
|
105 |
-
vad_audio_bytes = vad_audio.tobytes()
|
106 |
-
|
107 |
-
return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
|
108 |
-
except Exception as e:
|
109 |
-
msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
|
110 |
-
print(msg)
|
111 |
-
return -1, ori_audio, round(time.time() - _st, 4)
|
112 |
-
|
113 |
-
|
114 |
-
def warm_up():
|
115 |
-
frames = np.zeros((1, 1600)) # 1024 frames of 2 bytes each
|
116 |
-
_, frames, tcost = run_vad(frames, 16000)
|
117 |
-
print(f"warm up done, time_cost: {tcost:.3f} s")
|
118 |
-
|
119 |
-
|
120 |
-
# warm_up()
|
121 |
-
|
122 |
-
|
123 |
-
@dataclass
|
124 |
-
class AppState:
|
125 |
-
stream: np.ndarray | None = None
|
126 |
-
sampling_rate: int = 0
|
127 |
-
pause_detected: bool = False
|
128 |
-
started_talking: bool = False
|
129 |
-
responding: bool = False
|
130 |
-
stopped: bool = False
|
131 |
-
buffer: np.ndarray | None = None
|
132 |
-
|
133 |
-
|
134 |
-
def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
|
135 |
-
"""Take in the stream, determine if a pause happened"""
|
136 |
-
duration = len(audio) / sampling_rate
|
137 |
-
|
138 |
-
dur_vad, _, _ = run_vad(audio, sampling_rate)
|
139 |
-
|
140 |
-
if duration >= 0.60:
|
141 |
-
if dur_vad > 0.2 and not state.started_talking:
|
142 |
-
print("started talking")
|
143 |
-
state.started_talking = True
|
144 |
-
if state.started_talking:
|
145 |
-
if state.stream is None:
|
146 |
-
state.stream = audio
|
147 |
-
else:
|
148 |
-
state.stream = np.concatenate((state.stream, audio))
|
149 |
-
state.buffer = None
|
150 |
-
if dur_vad < 0.1 and state.started_talking:
|
151 |
-
segment = AudioSegment(
|
152 |
-
state.stream.tobytes(),
|
153 |
-
frame_rate=sampling_rate,
|
154 |
-
sample_width=audio.dtype.itemsize,
|
155 |
-
channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
|
156 |
-
)
|
157 |
-
|
158 |
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
159 |
-
segment.export(f.name, format="wav")
|
160 |
-
print("input file written", f.name)
|
161 |
-
return True
|
162 |
-
return False
|
163 |
-
|
164 |
-
|
165 |
-
def speaking(audio_bytes: str):
|
166 |
base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
|
167 |
files = {"audio": base64_encoded}
|
168 |
byte_buffer = b""
|
@@ -194,73 +105,24 @@ def speaking(audio_bytes: str):
|
|
194 |
raise gr.Error(f"Error during audio streaming: {e}")
|
195 |
|
196 |
|
197 |
-
def process_audio(audio: tuple, state: AppState) -> None:
|
198 |
-
frame_rate, array = audio
|
199 |
-
array = np.squeeze(array)
|
200 |
-
if not state.sampling_rate:
|
201 |
-
state.sampling_rate = frame_rate
|
202 |
-
if state.buffer is None:
|
203 |
-
state.buffer = array
|
204 |
-
else:
|
205 |
-
state.buffer = np.concatenate((state.buffer, array))
|
206 |
-
|
207 |
-
pause_detected = determine_pause(state.buffer, state.sampling_rate, state)
|
208 |
-
state.pause_detected = pause_detected
|
209 |
-
|
210 |
|
211 |
-
def response(
|
212 |
-
|
213 |
-
|
214 |
|
215 |
audio_buffer = io.BytesIO()
|
216 |
segment = AudioSegment(
|
217 |
-
|
218 |
-
frame_rate=
|
219 |
-
sample_width=
|
220 |
-
channels=
|
221 |
-
|
222 |
segment.export(audio_buffer, format="wav")
|
223 |
|
224 |
for numpy_array in speaking(audio_buffer.getvalue()):
|
225 |
yield (OUT_RATE, numpy_array, "mono")
|
226 |
|
227 |
|
228 |
-
class OmniHandler(StreamHandler):
|
229 |
-
def __init__(self) -> None:
|
230 |
-
super().__init__(
|
231 |
-
expected_layout="mono", output_sample_rate=OUT_RATE, output_frame_size=480
|
232 |
-
)
|
233 |
-
self.event = Event()
|
234 |
-
self.state = AppState()
|
235 |
-
self.generator = None
|
236 |
-
self.duration = 0
|
237 |
-
|
238 |
-
def receive(self, frame: tuple[int, np.ndarray]) -> None:
|
239 |
-
if self.state.responding:
|
240 |
-
return
|
241 |
-
process_audio(frame, self.state)
|
242 |
-
if self.state.pause_detected:
|
243 |
-
self.event.set()
|
244 |
-
|
245 |
-
def reset(self):
|
246 |
-
self.generator = None
|
247 |
-
self.event.clear()
|
248 |
-
self.state = AppState()
|
249 |
-
self.duration = 0
|
250 |
-
|
251 |
-
def emit(self):
|
252 |
-
if not self.event.is_set():
|
253 |
-
return None
|
254 |
-
else:
|
255 |
-
if not self.generator:
|
256 |
-
self.generator = response(self.state)
|
257 |
-
self.state.responding = True
|
258 |
-
try:
|
259 |
-
return next(self.generator)
|
260 |
-
except StopIteration:
|
261 |
-
self.reset()
|
262 |
-
|
263 |
-
|
264 |
with gr.Blocks() as demo:
|
265 |
gr.HTML(
|
266 |
"""
|
@@ -277,7 +139,7 @@ with gr.Blocks() as demo:
|
|
277 |
mode="send-receive",
|
278 |
modality="audio",
|
279 |
)
|
280 |
-
audio.stream(fn=
|
281 |
|
282 |
|
283 |
demo.launch(ssr_mode=False)
|
|
|
30 |
import librosa
|
31 |
import numpy as np
|
32 |
import requests
|
33 |
+
from gradio_webrtc import ReplyOnPause, WebRTC
|
34 |
from huggingface_hub import snapshot_download
|
35 |
from pydub import AudioSegment
|
36 |
from twilio.rest import Client
|
|
|
67 |
else:
|
68 |
rtc_configuration = None
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
OUT_CHANNELS = 1
|
71 |
OUT_RATE = 24000
|
72 |
OUT_SAMPLE_WIDTH = 2
|
73 |
OUT_CHUNK = 20 * 4096
|
74 |
|
75 |
|
76 |
+
def speaking(audio_bytes: bytes):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
|
78 |
files = {"audio": base64_encoded}
|
79 |
byte_buffer = b""
|
|
|
105 |
raise gr.Error(f"Error during audio streaming: {e}")
|
106 |
|
107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
+
def response(audio: tuple[int, np.ndarray]):
|
110 |
+
sampling_rate, audio_np = audio
|
111 |
+
audio_np = audio_np.squeeze()
|
112 |
|
113 |
audio_buffer = io.BytesIO()
|
114 |
segment = AudioSegment(
|
115 |
+
audio_np.tobytes(),
|
116 |
+
frame_rate=sampling_rate,
|
117 |
+
sample_width=audio_np.dtype.itemsize,
|
118 |
+
channels=1)
|
119 |
+
|
120 |
segment.export(audio_buffer, format="wav")
|
121 |
|
122 |
for numpy_array in speaking(audio_buffer.getvalue()):
|
123 |
yield (OUT_RATE, numpy_array, "mono")
|
124 |
|
125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
with gr.Blocks() as demo:
|
127 |
gr.HTML(
|
128 |
"""
|
|
|
139 |
mode="send-receive",
|
140 |
modality="audio",
|
141 |
)
|
142 |
+
audio.stream(fn=ReplyOnPause(response), inputs=[audio], outputs=[audio], time_limit=60)
|
143 |
|
144 |
|
145 |
demo.launch(ssr_mode=False)
|