Upload folder using huggingface_hub
- README.md +1 -1
- app.py +20 -9
- index.html +134 -26
- requirements.txt +1 -1
README.md
CHANGED
@@ -9,7 +9,7 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Transcribe audio in realtime with Whisper
-tags: [webrtc, websocket, gradio, secret|
+tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GROQ_API_KEY]
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
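The updated tags declare the two secrets the Space needs, HF_TOKEN and GROQ_API_KEY, which Hugging Face exposes to the running app as environment variables. As a local-development sketch only (the .env file and the assert are assumptions, not part of this commit), the same variables can be supplied through python-dotenv, matching the load_dotenv() / AsyncClient() pattern app.py already uses:

# Local sketch: provide the Space secrets via a .env file when running app.py outside Spaces.
import os

from dotenv import load_dotenv
from groq import AsyncClient

load_dotenv()  # copies KEY=value pairs from .env into os.environ

# AsyncClient() reads GROQ_API_KEY from the environment when no api_key is passed,
# which is how app.py builds its Groq client right after load_dotenv().
assert os.getenv("GROQ_API_KEY"), "GROQ_API_KEY is required for Whisper transcription"
groq_client = AsyncClient()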
app.py
CHANGED
@@ -15,6 +15,7 @@ from fastrtc import (
 )
 from gradio.utils import get_space
 from groq import AsyncClient
+from pydantic import BaseModel
 
 cur_dir = Path(__file__).parent
 
@@ -24,23 +25,23 @@ load_dotenv()
 groq_client = AsyncClient()
 
 
-async def transcribe(audio: tuple[int, np.ndarray]):
-    transcript = await groq_client.audio.transcriptions.create(
+async def transcribe(audio: tuple[int, np.ndarray], transcript: str):
+    response = await groq_client.audio.transcriptions.create(
         file=("audio-file.mp3", audio_to_bytes(audio)),
         model="whisper-large-v3-turbo",
         response_format="verbose_json",
     )
-    yield AdditionalOutputs(transcript.text)
+    yield AdditionalOutputs(transcript + "\n" + response.text)
 
 
+transcript = gr.Textbox(label="Transcript")
 stream = Stream(
     ReplyOnPause(transcribe),
     modality="audio",
     mode="send",
-
-
-
-    additional_outputs_handler=lambda a, b: a + " " + b,
+    additional_inputs=[transcript],
+    additional_outputs=[transcript],
+    additional_outputs_handler=lambda a, b: b,
     rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
     concurrency_limit=5 if get_space() else None,
     time_limit=90 if get_space() else None,
@@ -51,11 +52,21 @@ app = FastAPI()
 stream.mount(app)
 
 
+class SendInput(BaseModel):
+    webrtc_id: str
+    transcript: str
+
+
+@app.post("/send_input")
+def send_input(body: SendInput):
+    stream.set_input(body.webrtc_id, body.transcript)
+
+
 @app.get("/transcript")
 def _(webrtc_id: str):
     async def output_stream():
         async for output in stream.output_stream(webrtc_id):
-            transcript = output.args[0]
+            transcript = output.args[0].split("\n")[-1]
             yield f"event: output\ndata: {transcript}\n\n"
 
     return StreamingResponse(output_stream(), media_type="text/event-stream")
@@ -73,7 +84,7 @@ if __name__ == "__main__":
     import os
 
     if (mode := os.getenv("MODE")) == "UI":
-        stream.ui.launch(server_port=7860
+        stream.ui.launch(server_port=7860)
     elif mode == "PHONE":
         stream.fastphone(host="0.0.0.0", port=7860)
     else:
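Taken together, the app.py changes make the transcript a piece of state that flows both ways: the browser can reset it through the new POST /send_input route (stream.set_input feeds it back to transcribe as the transcript additional input), and GET /transcript streams only the newest line of the accumulated text as server-sent events. A rough client sketch, under stated assumptions: requests is installed, the app is running locally on port 7860, and webrtc_id is a placeholder for the id the browser generates during the WebRTC handshake.

# Illustrative client for the two HTTP routes above; webrtc_id is a placeholder,
# since in the real app the browser creates it while negotiating the WebRTC connection.
import requests

BASE_URL = "http://localhost:7860"   # assumption: Space running locally
webrtc_id = "example-webrtc-id"      # hypothetical id, for illustration only

# Reset the accumulated transcript for this connection (new POST /send_input route).
requests.post(f"{BASE_URL}/send_input", json={"webrtc_id": webrtc_id, "transcript": ""})

# Read transcript updates as server-sent events (GET /transcript route).
with requests.get(f"{BASE_URL}/transcript", params={"webrtc_id": webrtc_id}, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data:"):
            print(line[len("data:"):].strip())

The updated index.html below performs the same POST from its handleMessage handler whenever the server sends a send_input event over the data channel.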
index.html
CHANGED
@@ -73,6 +73,8 @@
             transition: all 0.2s ease;
             font-weight: 500;
             min-width: 180px;
+            position: relative;
+            padding-right: 50px;
         }
 
         button:hover {
@@ -176,6 +178,40 @@
             transition: transform 0.1s ease;
         }
 
+        /* Styles for the mute button */
+        .mute-toggle {
+            position: absolute;
+            right: 10px;
+            top: 50%;
+            transform: translateY(-50%);
+            width: 24px;
+            height: 24px;
+            cursor: pointer;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+        }
+
+        .mute-toggle svg {
+            width: 20px;
+            height: 20px;
+            stroke: white;
+        }
+
+        /* Adjust layout for button content when mute is present */
+        .button-content {
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            width: calc(100% - 40px);
+            margin-right: 40px;
+        }
+
+        .icon-with-spinner,
+        .pulse-container {
+            width: 100%;
+        }
+
         @keyframes spin {
             to {
                 transform: rotate(360deg);
@@ -193,7 +229,8 @@
     </div>
 
     <div class="container">
-        <div class="transcript-container" id="transcript"
+        <div class="transcript-container" id="transcript">
+        </div>
         <div class="controls">
             <button id="start-button">Start Recording</button>
         </div>
@@ -205,10 +242,29 @@
         let audioContext, analyser, audioSource;
         let audioLevel = 0;
         let animationFrame;
+        let isMuted = false;
 
         const startButton = document.getElementById('start-button');
         const transcriptDiv = document.getElementById('transcript');
 
+        // SVG Icons
+        const micIconSVG = `
+            <svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
+                <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
+                <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
+                <line x1="12" y1="19" x2="12" y2="23"></line>
+                <line x1="8" y1="23" x2="16" y2="23"></line>
+            </svg>`;
+
+        const micMutedIconSVG = `
+            <svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
+                <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
+                <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
+                <line x1="12" y1="19" x2="12" y2="23"></line>
+                <line x1="8" y1="23" x2="16" y2="23"></line>
+                <line x1="1" y1="1" x2="23" y2="23"></line>
+            </svg>`;
+
         function showError(message) {
             const toast = document.getElementById('error-toast');
             toast.textContent = message;
@@ -220,35 +276,83 @@
             }, 5000);
         }
 
-        function handleMessage(event) {
+        async function handleMessage(event) {
             // Handle any WebRTC data channel messages if needed
             const eventJson = JSON.parse(event.data);
             if (eventJson.type === "error") {
                 showError(eventJson.message);
+            } else if (eventJson.type === "send_input") {
+                const response = await fetch('/send_input', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({
+                        webrtc_id: webrtc_id,
+                        transcript: ""
+                    })
+                });
             }
             console.log('Received message:', event.data);
+
         }
 
         function updateButtonState() {
+            // Remove existing mute listener if present
+            const existingMuteButton = startButton.querySelector('.mute-toggle');
+            if (existingMuteButton) {
+                existingMuteButton.removeEventListener('click', toggleMute);
+                existingMuteButton.remove();
+            }
+
             if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
                 startButton.innerHTML = `
-                    <div class="icon-with-spinner">
-                        <div class="spinner"></div>
-                        <span>Connecting...</span>
+                    <div class="button-content">
+                        <div class="icon-with-spinner">
+                            <div class="spinner"></div>
+                            <span>Connecting...</span>
+                        </div>
                     </div>
                 `;
+                startButton.disabled = true;
             } else if (peerConnection && peerConnection.connectionState === 'connected') {
                 startButton.innerHTML = `
-                    <div class="pulse-container">
-                        <div class="pulse-circle"></div>
-                        <span>Stop Recording</span>
+                    <div class="button-content">
+                        <div class="pulse-container">
+                            <div class="pulse-circle"></div>
+                            <span>Stop Recording</span>
+                        </div>
+                    </div>
+                    <div class="mute-toggle" title="${isMuted ? 'Unmute' : 'Mute'}">
+                        ${isMuted ? micMutedIconSVG : micIconSVG}
                     </div>
                 `;
+                startButton.disabled = false;
+                const muteButton = startButton.querySelector('.mute-toggle');
+                if (muteButton) {
+                    muteButton.addEventListener('click', toggleMute);
+                }
             } else {
                 startButton.innerHTML = 'Start Recording';
+                startButton.disabled = false;
             }
         }
 
+        function toggleMute(event) {
+            event.stopPropagation();
+            if (!peerConnection || peerConnection.connectionState !== 'connected') return;
+
+            isMuted = !isMuted;
+            console.log("Mute toggled:", isMuted);
+
+            peerConnection.getSenders().forEach(sender => {
+                if (sender.track && sender.track.kind === 'audio') {
+                    sender.track.enabled = !isMuted;
+                    console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
+                }
+            });
+
+            updateButtonState();
+        }
+
         function setupAudioVisualization(stream) {
            audioContext = new (window.AudioContext || window.webkitAudioContext)();
            analyser = audioContext.createAnalyser();
@@ -381,41 +485,45 @@
         function stop() {
             if (animationFrame) {
                 cancelAnimationFrame(animationFrame);
+                animationFrame = null;
             }
             if (audioContext) {
-                audioContext.close();
+                audioContext.close().catch(e => console.error("Error closing AudioContext:", e));
                 audioContext = null;
                 analyser = null;
                 audioSource = null;
             }
             if (peerConnection) {
-                if (peerConnection.getTransceivers) {
-                    peerConnection.getTransceivers().forEach(transceiver => {
-                        if (transceiver.stop) {
-                            transceiver.stop();
-                        }
-                    });
-                }
-
                 if (peerConnection.getSenders) {
                     peerConnection.getSenders().forEach(sender => {
-                        if (sender.track
+                        if (sender.track) {
+                            sender.track.stop();
+                            console.log(`Track ${sender.track.id} stopped.`);
+                        }
                     });
                 }
-
-
-
-                }, 500);
+                peerConnection.close();
+                peerConnection = null;
+                console.log("Peer connection closed.");
             }
             audioLevel = 0;
+            isMuted = false;
             updateButtonState();
         }
 
-        startButton.addEventListener('click', () => {
-            if (
-
-            }
+        startButton.addEventListener('click', (event) => {
+            if (event.target.closest('.mute-toggle')) {
+                return;
+            }
+
+            if (peerConnection && peerConnection.connectionState === 'connected') {
+                console.log("Stop button clicked");
                 stop();
+            } else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
+                console.log("Start button clicked");
+                transcriptDiv.innerHTML = '';
+                setupWebRTC();
+                updateButtonState();
             }
         });
     </script>
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-fastrtc[vad]
+fastrtc[vad]==0.0.20.rc2
 groq
 python-dotenv
 twilio