freddyaboulton (HF Staff) committed (verified)
Commit 250897a · 1 Parent(s): 355b277

Upload folder using huggingface_hub

Files changed (4):
  1. README.md +1 -1
  2. app.py +20 -9
  3. index.html +134 -26
  4. requirements.txt +1 -1
README.md CHANGED
@@ -9,7 +9,7 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Transcribe audio in realtime with Whisper
-tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GROQ_API_KEY]
+tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GROQ_API_KEY]
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -15,6 +15,7 @@ from fastrtc import (
 )
 from gradio.utils import get_space
 from groq import AsyncClient
+from pydantic import BaseModel
 
 cur_dir = Path(__file__).parent
 
@@ -24,23 +25,23 @@ load_dotenv()
 groq_client = AsyncClient()
 
 
-async def transcribe(audio: tuple[int, np.ndarray]):
-    transcript = await groq_client.audio.transcriptions.create(
+async def transcribe(audio: tuple[int, np.ndarray], transcript: str):
+    response = await groq_client.audio.transcriptions.create(
         file=("audio-file.mp3", audio_to_bytes(audio)),
         model="whisper-large-v3-turbo",
         response_format="verbose_json",
     )
-    yield AdditionalOutputs(transcript.text)
+    yield AdditionalOutputs(transcript + "\n" + response.text)
 
 
+transcript = gr.Textbox(label="Transcript")
 stream = Stream(
     ReplyOnPause(transcribe),
     modality="audio",
     mode="send",
-    additional_outputs=[
-        gr.Textbox(label="Transcript"),
-    ],
-    additional_outputs_handler=lambda a, b: a + " " + b,
+    additional_inputs=[transcript],
+    additional_outputs=[transcript],
+    additional_outputs_handler=lambda a, b: b,
     rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
     concurrency_limit=5 if get_space() else None,
     time_limit=90 if get_space() else None,
@@ -51,11 +52,21 @@ app = FastAPI()
 stream.mount(app)
 
 
+class SendInput(BaseModel):
+    webrtc_id: str
+    transcript: str
+
+
+@app.post("/send_input")
+def send_input(body: SendInput):
+    stream.set_input(body.webrtc_id, body.transcript)
+
+
 @app.get("/transcript")
 def _(webrtc_id: str):
     async def output_stream():
         async for output in stream.output_stream(webrtc_id):
-            transcript = output.args[0]
+            transcript = output.args[0].split("\n")[-1]
             yield f"event: output\ndata: {transcript}\n\n"
 
     return StreamingResponse(output_stream(), media_type="text/event-stream")
@@ -73,7 +84,7 @@ if __name__ == "__main__":
     import os
 
     if (mode := os.getenv("MODE")) == "UI":
-        stream.ui.launch(server_port=7860, server_name="0.0.0.0")
+        stream.ui.launch(server_port=7860)
     elif mode == "PHONE":
         stream.fastphone(host="0.0.0.0", port=7860)
     else:
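
For context, a minimal Python sketch (not part of this commit) of how a client can drive the two HTTP endpoints introduced above: POST /send_input resets the server-side transcript state, and GET /transcript streams transcription results as server-sent events. The base URL and the webrtc_id value are placeholder assumptions; in the app itself, webrtc_id comes from the WebRTC handshake and the POST is triggered by a "send_input" data-channel message (see the index.html changes below).

# Hypothetical standalone client for the endpoints added in app.py above.
# Assumptions: the Space runs at http://localhost:7860, and `webrtc_id`
# was already negotiated by a WebRTC client (the value below is made up).
import requests

BASE_URL = "http://localhost:7860"  # assumed local address
webrtc_id = "example-webrtc-id"     # hypothetical; produced by the handshake

# Reset the accumulated transcript for this connection, mirroring the
# fetch('/send_input', ...) call the page makes on a "send_input" message.
requests.post(
    f"{BASE_URL}/send_input",
    json={"webrtc_id": webrtc_id, "transcript": ""},
)

# Stream server-sent events; each "output" event's data line carries the
# latest transcript line (app.py yields "event: output\ndata: ...").
with requests.get(
    f"{BASE_URL}/transcript", params={"webrtc_id": webrtc_id}, stream=True
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            print(line[len("data: "):])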
index.html CHANGED
@@ -73,6 +73,8 @@
     transition: all 0.2s ease;
     font-weight: 500;
     min-width: 180px;
+    position: relative;
+    padding-right: 50px;
 }
 
 button:hover {
@@ -176,6 +178,40 @@
     transition: transform 0.1s ease;
 }
 
+/* Styles for the mute button */
+.mute-toggle {
+    position: absolute;
+    right: 10px;
+    top: 50%;
+    transform: translateY(-50%);
+    width: 24px;
+    height: 24px;
+    cursor: pointer;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+}
+
+.mute-toggle svg {
+    width: 20px;
+    height: 20px;
+    stroke: white;
+}
+
+/* Adjust layout for button content when mute is present */
+.button-content {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    width: calc(100% - 40px);
+    margin-right: 40px;
+}
+
+.icon-with-spinner,
+.pulse-container {
+    width: 100%;
+}
+
 @keyframes spin {
     to {
         transform: rotate(360deg);
@@ -193,7 +229,8 @@
 </div>
 
 <div class="container">
-    <div class="transcript-container" id="transcript"></div>
+    <div class="transcript-container" id="transcript">
+    </div>
     <div class="controls">
         <button id="start-button">Start Recording</button>
     </div>
@@ -205,10 +242,29 @@
 let audioContext, analyser, audioSource;
 let audioLevel = 0;
 let animationFrame;
+let isMuted = false;
 
 const startButton = document.getElementById('start-button');
 const transcriptDiv = document.getElementById('transcript');
 
+// SVG Icons
+const micIconSVG = `
+    <svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
+        <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
+        <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
+        <line x1="12" y1="19" x2="12" y2="23"></line>
+        <line x1="8" y1="23" x2="16" y2="23"></line>
+    </svg>`;
+
+const micMutedIconSVG = `
+    <svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
+        <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
+        <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
+        <line x1="12" y1="19" x2="12" y2="23"></line>
+        <line x1="8" y1="23" x2="16" y2="23"></line>
+        <line x1="1" y1="1" x2="23" y2="23"></line>
+    </svg>`;
+
 function showError(message) {
     const toast = document.getElementById('error-toast');
     toast.textContent = message;
@@ -220,35 +276,83 @@
     }, 5000);
 }
 
-function handleMessage(event) {
+async function handleMessage(event) {
     // Handle any WebRTC data channel messages if needed
     const eventJson = JSON.parse(event.data);
     if (eventJson.type === "error") {
         showError(eventJson.message);
+    } else if (eventJson.type === "send_input") {
+        const response = await fetch('/send_input', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+                webrtc_id: webrtc_id,
+                transcript: ""
+            })
+        });
     }
     console.log('Received message:', event.data);
+
 }
 
 function updateButtonState() {
+    // Remove existing mute listener if present
+    const existingMuteButton = startButton.querySelector('.mute-toggle');
+    if (existingMuteButton) {
+        existingMuteButton.removeEventListener('click', toggleMute);
+        existingMuteButton.remove();
+    }
+
     if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
         startButton.innerHTML = `
-            <div class="icon-with-spinner">
-                <div class="spinner"></div>
-                <span>Connecting...</span>
+            <div class="button-content">
+                <div class="icon-with-spinner">
+                    <div class="spinner"></div>
+                    <span>Connecting...</span>
+                </div>
             </div>
         `;
+        startButton.disabled = true;
     } else if (peerConnection && peerConnection.connectionState === 'connected') {
         startButton.innerHTML = `
-            <div class="pulse-container">
-                <div class="pulse-circle"></div>
-                <span>Stop Recording</span>
+            <div class="button-content">
+                <div class="pulse-container">
+                    <div class="pulse-circle"></div>
+                    <span>Stop Recording</span>
+                </div>
+            </div>
+            <div class="mute-toggle" title="${isMuted ? 'Unmute' : 'Mute'}">
+                ${isMuted ? micMutedIconSVG : micIconSVG}
             </div>
         `;
+        startButton.disabled = false;
+        const muteButton = startButton.querySelector('.mute-toggle');
+        if (muteButton) {
+            muteButton.addEventListener('click', toggleMute);
+        }
     } else {
         startButton.innerHTML = 'Start Recording';
+        startButton.disabled = false;
     }
 }
 
+function toggleMute(event) {
+    event.stopPropagation();
+    if (!peerConnection || peerConnection.connectionState !== 'connected') return;
+
+    isMuted = !isMuted;
+    console.log("Mute toggled:", isMuted);
+
+    peerConnection.getSenders().forEach(sender => {
+        if (sender.track && sender.track.kind === 'audio') {
+            sender.track.enabled = !isMuted;
+            console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
+        }
+    });
+
+    updateButtonState();
+}
+
 function setupAudioVisualization(stream) {
     audioContext = new (window.AudioContext || window.webkitAudioContext)();
     analyser = audioContext.createAnalyser();
@@ -381,41 +485,45 @@
 function stop() {
     if (animationFrame) {
         cancelAnimationFrame(animationFrame);
+        animationFrame = null;
     }
     if (audioContext) {
-        audioContext.close();
+        audioContext.close().catch(e => console.error("Error closing AudioContext:", e));
         audioContext = null;
         analyser = null;
         audioSource = null;
     }
     if (peerConnection) {
-        if (peerConnection.getTransceivers) {
-            peerConnection.getTransceivers().forEach(transceiver => {
-                if (transceiver.stop) {
-                    transceiver.stop();
-                }
-            });
-        }
-
         if (peerConnection.getSenders) {
             peerConnection.getSenders().forEach(sender => {
-                if (sender.track && sender.track.stop) sender.track.stop();
+                if (sender.track) {
+                    sender.track.stop();
+                    console.log(`Track ${sender.track.id} stopped.`);
+                }
             });
         }
-
-        setTimeout(() => {
-            peerConnection.close();
-        }, 500);
+        peerConnection.close();
+        peerConnection = null;
+        console.log("Peer connection closed.");
     }
     audioLevel = 0;
+    isMuted = false;
     updateButtonState();
 }
 
-startButton.addEventListener('click', () => {
-    if (startButton.textContent === 'Start Recording') {
-        setupWebRTC();
-    } else {
+startButton.addEventListener('click', (event) => {
+    if (event.target.closest('.mute-toggle')) {
+        return;
+    }
+
+    if (peerConnection && peerConnection.connectionState === 'connected') {
+        console.log("Stop button clicked");
         stop();
+    } else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
+        console.log("Start button clicked");
+        transcriptDiv.innerHTML = '';
+        setupWebRTC();
+        updateButtonState();
     }
 });
 </script>
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-fastrtc[vad]
+fastrtc[vad]==0.0.20.rc2
 groq
 python-dotenv
 twilio