Jaward committed · verified
Commit 01023e5 · 1 Parent(s): 2a37747

Update app.py

Files changed (1):
  1. app.py (+101 -27)

app.py CHANGED
@@ -10,6 +10,12 @@ import torch
 import random
 from openai import OpenAI
 import subprocess
+import threading
+import queue
+import sounddevice as sd
+import numpy as np
+import wave
+import sys
 
 default_lang = "en"
 
@@ -112,18 +118,90 @@ def models(text, model="Llama 3 8B Service", seed=42):
 
     return output
 
-async def respond(audio, model, seed):
-    if audio is None:
-        return None
-    user = transcribe(audio)
-    if not user:
-        return None
-    reply = models(user, model, seed)
-    communicate = edge_tts.Communicate(reply, voice="en-US-ChristopherNeural")
+# New global variables for audio processing
+RATE = 16000
+CHUNK = int(RATE / 10)  # 100ms
+audio_queue = queue.Queue()
+is_listening = False
+
+def audio_callback(indata, frames, time, status):
+    if status:
+        print(status, file=sys.stderr)
+    audio_queue.put(indata.copy())
+
+def process_audio_stream(model, seed):
+    global is_listening
+    audio_buffer = []
+    silence_threshold = 0.01
+    silence_duration = 0
+    max_silence = 2  # seconds
+
+    while True:
+        if not is_listening:
+            audio_buffer.clear()
+            silence_duration = 0
+            audio_queue.queue.clear()
+            continue
+
+        try:
+            chunk = audio_queue.get(timeout=1)
+            audio_buffer.append(chunk)
+
+            # Check for silence
+            if np.abs(chunk).mean() < silence_threshold:
+                silence_duration += CHUNK / RATE
+            else:
+                silence_duration = 0
+
+            if silence_duration > max_silence:
+                # Process the buffered audio
+                audio_data = np.concatenate(audio_buffer)
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+                    tmp_path = tmp_file.name
+                with wave.open(tmp_path, 'wb') as wf:
+                    wf.setnchannels(1)
+                    wf.setsampwidth(2)
+                    wf.setframerate(RATE)
+                    wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
+
+                # Transcribe and process
+                user_input = transcribe(tmp_path)
+                if user_input:
+                    is_listening = False
+                    reply = models(user_input, model, seed)
+                    asyncio.run(respond_and_play(reply))
+                    is_listening = True
+
+                # Clear the buffer
+                audio_buffer.clear()
+                silence_duration = 0
+
+        except queue.Empty:
+            pass
+
+async def respond_and_play(text):
+    communicate = edge_tts.Communicate(text, voice="en-US-ChristopherNeural")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
         tmp_path = tmp_file.name
         await communicate.save(tmp_path)
-    return tmp_path
+
+    # Play the audio
+    with wave.open(tmp_path, 'rb') as wf:
+        data = wf.readframes(wf.getnframes())
+        sd.play(np.frombuffer(data, dtype=np.int16), wf.getframerate())
+        sd.wait()
+
+def start_listening(model, seed):
+    global is_listening
+    is_listening = True
+    threading.Thread(target=process_audio_stream, args=(model, seed), daemon=True).start()
+    with sd.InputStream(callback=audio_callback, channels=1, samplerate=RATE, blocksize=CHUNK):
+        while is_listening:
+            sd.sleep(100)
+
+def stop_listening():
+    global is_listening
+    is_listening = False
 
 # Supported languages for seamless-expressive
 LANGUAGE_CODES = {
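
The heart of this hunk is the silence-based endpointing in process_audio_stream: 100 ms chunks are buffered, and once the mean amplitude stays below the threshold for about two seconds, the buffer is flushed as one utterance and handed to transcribe. A minimal, self-contained sketch of that idea (chunk size, threshold, and the float32 input range are carried over from the diff; split_utterances is an illustrative helper, not part of the app):

import numpy as np

RATE = 16000
CHUNK = RATE // 10         # 100 ms of samples per chunk
SILENCE_THRESHOLD = 0.01   # mean |amplitude| below this counts as silence
MAX_SILENCE = 2.0          # seconds of silence that close an utterance

def split_utterances(chunks):
    # Yield one utterance (np.ndarray) per silence gap from an iterable of chunks.
    buffer, silence = [], 0.0
    for chunk in chunks:
        buffer.append(chunk)
        # Accumulate silence time on quiet chunks, reset it on speech
        if np.abs(chunk).mean() < SILENCE_THRESHOLD:
            silence += CHUNK / RATE
        else:
            silence = 0.0
        if silence > MAX_SILENCE:
            yield np.concatenate(buffer)
            buffer, silence = [], 0.0

Two caveats worth flagging: sounddevice records from the machine running the script, so on a hosted Space this captures the server's microphone rather than the visitor's, and the continue branch in process_audio_stream spins without sleeping while is_listening is false (a short time.sleep there would spare a core).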
@@ -198,17 +276,21 @@ with gr.Blocks(css="style.css") as demo:
                 value=0,
                 visible=False
             )
-            input = gr.Audio(label="User", sources=["microphone"], type="filepath")
-            output = gr.Audio(label="AI", type="filepath",
-                              interactive=False,
-                              autoplay=True,
-                              elem_classes="audio")
+            start_button = gr.Button("Start Listening")
+            stop_button = gr.Button("Stop Listening")
+            status = gr.Markdown("Status: Not listening")
 
-            gr.Interface(
-                fn=respond,
-                inputs=[input, select, seed],
-                outputs=[output],
-                live=True
+            start_button.click(
+                fn=lambda model, seed: start_listening(model, seed),
+                inputs=[select, seed],
+                outputs=[status],
+                _js="() => {document.getElementById('status').textContent = 'Status: Listening'}"
+            )
+            stop_button.click(
+                fn=stop_listening,
+                inputs=[],
+                outputs=[status],
+                _js="() => {document.getElementById('status').textContent = 'Status: Not listening'}"
             )
 
         with gr.TabItem("Speech Translation") as speech_translation:
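
On the wiring itself: _js is the pre-4.0 Gradio keyword (renamed js in Gradio 4), both click handlers declare outputs=[status] while returning nothing, so the Markdown never updates server-side, and the inline JS assumes an element with DOM id 'status', which gr.Markdown only gets if elem_id is set. A sketch of the same hookup that returns the status text directly and keeps the blocking start_listening loop off Gradio's worker thread (on_start and on_stop are hypothetical helpers, not in the commit):

import threading

def on_start(model, seed):
    # start_listening blocks in its capture loop, so run it in the background
    threading.Thread(target=start_listening, args=(model, seed), daemon=True).start()
    return "Status: Listening"

def on_stop():
    stop_listening()
    return "Status: Not listening"

start_button.click(fn=on_start, inputs=[select, seed], outputs=[status])
stop_button.click(fn=on_stop, inputs=[], outputs=[status])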
@@ -230,14 +312,6 @@ with gr.Blocks(css="style.css") as demo:
                 live=True
             )
 
-            # clear_button = gr.Button("Clear")
-            # clear_button.click(
-            #     fn=clear_history,
-            #     inputs=[],
-            #     outputs=[input, output, input_audio, output_audio],
-            #     api_name="clear"
-            # )
-
     voice_assistant.select(fn=voice_assistant_tab, inputs=None, outputs=description)
     speech_translation.select(fn=speech_translation_tab, inputs=None, outputs=description)
 
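For a quick smoke test of the whole loop outside the UI, something like the following should work on a machine with a working microphone (the model name matches the default in models; the 30-second window is arbitrary):

import threading
import time

# Hypothetical local harness: capture in the background, speak a prompt,
# pause ~2 s, and the assistant should transcribe, reply, and play TTS back.
listener = threading.Thread(target=start_listening,
                            args=("Llama 3 8B Service", 42), daemon=True)
listener.start()
time.sleep(30)   # talk to the assistant during this window
stop_listening()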