Upload folder using huggingface_hub
- README.md +1 -1
- app.py +20 -9
- index.html +134 -26
- requirements.txt +1 -1
README.md
CHANGED
@@ -9,7 +9,7 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Transcribe audio in realtime with Whisper
-tags: [webrtc, websocket, gradio, secret|
+tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GROQ_API_KEY]
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
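The updated tags declare the two secrets the Space needs, HF_TOKEN and GROQ_API_KEY, which Hugging Face exposes to the running app as environment variables. As a local-development sketch only (the .env file and the assert are assumptions, not part of this commit), the same variables can be supplied through python-dotenv, matching the load_dotenv() / AsyncClient() pattern app.py already uses:

# Local sketch: provide the Space secrets via a .env file when running app.py outside Spaces.
import os

from dotenv import load_dotenv
from groq import AsyncClient

load_dotenv()  # copies KEY=value pairs from .env into os.environ

# AsyncClient() reads GROQ_API_KEY from the environment when no api_key is passed,
# which is how app.py builds its Groq client right after load_dotenv().
assert os.getenv("GROQ_API_KEY"), "GROQ_API_KEY is required for Whisper transcription"
groq_client = AsyncClient()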
app.py
CHANGED
@@ -15,6 +15,7 @@ from fastrtc import (
 )
 from gradio.utils import get_space
 from groq import AsyncClient
+from pydantic import BaseModel
 
 cur_dir = Path(__file__).parent
 
@@ -24,23 +25,23 @@ load_dotenv()
 groq_client = AsyncClient()
 
 
-async def transcribe(audio: tuple[int, np.ndarray]):
-    transcript = await groq_client.audio.transcriptions.create(
+async def transcribe(audio: tuple[int, np.ndarray], transcript: str):
+    response = await groq_client.audio.transcriptions.create(
         file=("audio-file.mp3", audio_to_bytes(audio)),
         model="whisper-large-v3-turbo",
         response_format="verbose_json",
     )
-    yield AdditionalOutputs(transcript.text)
+    yield AdditionalOutputs(transcript + "\n" + response.text)
 
 
+transcript = gr.Textbox(label="Transcript")
 stream = Stream(
     ReplyOnPause(transcribe),
     modality="audio",
     mode="send",
-
-
-
-    additional_outputs_handler=lambda a, b: a + " " + b,
+    additional_inputs=[transcript],
+    additional_outputs=[transcript],
+    additional_outputs_handler=lambda a, b: b,
     rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
     concurrency_limit=5 if get_space() else None,
     time_limit=90 if get_space() else None,
@@ -51,11 +52,21 @@ app = FastAPI()
 stream.mount(app)
 
 
+class SendInput(BaseModel):
+    webrtc_id: str
+    transcript: str
+
+
+@app.post("/send_input")
+def send_input(body: SendInput):
+    stream.set_input(body.webrtc_id, body.transcript)
+
+
 @app.get("/transcript")
 def _(webrtc_id: str):
     async def output_stream():
         async for output in stream.output_stream(webrtc_id):
-            transcript = output.args[0]
+            transcript = output.args[0].split("\n")[-1]
             yield f"event: output\ndata: {transcript}\n\n"
 
     return StreamingResponse(output_stream(), media_type="text/event-stream")
@@ -73,7 +84,7 @@ if __name__ == "__main__":
     import os
 
     if (mode := os.getenv("MODE")) == "UI":
-        stream.ui.launch(server_port=7860
+        stream.ui.launch(server_port=7860)
     elif mode == "PHONE":
         stream.fastphone(host="0.0.0.0", port=7860)
     else:
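Taken together, the app.py changes make the transcript a piece of state that flows both ways: the browser can reset it through the new POST /send_input route (stream.set_input feeds it back to transcribe as the transcript additional input), and GET /transcript streams only the newest line of the accumulated text as server-sent events. A rough client sketch, under stated assumptions: requests is installed, the app is running locally on port 7860, and webrtc_id is a placeholder for the id the browser generates during the WebRTC handshake.

# Illustrative client for the two HTTP routes above; webrtc_id is a placeholder,
# since in the real app the browser creates it while negotiating the WebRTC connection.
import requests

BASE_URL = "http://localhost:7860"   # assumption: Space running locally
webrtc_id = "example-webrtc-id"      # hypothetical id, for illustration only

# Reset the accumulated transcript for this connection (new POST /send_input route).
requests.post(f"{BASE_URL}/send_input", json={"webrtc_id": webrtc_id, "transcript": ""})

# Read transcript updates as server-sent events (GET /transcript route).
with requests.get(f"{BASE_URL}/transcript", params={"webrtc_id": webrtc_id}, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data:"):
            print(line[len("data:"):].strip())

The updated index.html below performs the same POST from its handleMessage handler whenever the server sends a send_input event over the data channel.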
index.html
CHANGED
@@ -73,6 +73,8 @@
             transition: all 0.2s ease;
             font-weight: 500;
             min-width: 180px;
+            position: relative;
+            padding-right: 50px;
         }
 
         button:hover {
@@ -176,6 +178,40 @@
             transition: transform 0.1s ease;
         }
 
+        /* Styles for the mute button */
+        .mute-toggle {
+            position: absolute;
+            right: 10px;
+            top: 50%;
+            transform: translateY(-50%);
+            width: 24px;
+            height: 24px;
+            cursor: pointer;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+        }
+
+        .mute-toggle svg {
+            width: 20px;
+            height: 20px;
+            stroke: white;
+        }
+
+        /* Adjust layout for button content when mute is present */
+        .button-content {
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            width: calc(100% - 40px);
+            margin-right: 40px;
+        }
+
+        .icon-with-spinner,
+        .pulse-container {
+            width: 100%;
+        }
+
         @keyframes spin {
             to {
                 transform: rotate(360deg);
@@ -193,7 +229,8 @@
     </div>
 
     <div class="container">
-        <div class="transcript-container" id="transcript"
+        <div class="transcript-container" id="transcript">
+        </div>
         <div class="controls">
             <button id="start-button">Start Recording</button>
         </div>
@@ -205,10 +242,29 @@
         let audioContext, analyser, audioSource;
         let audioLevel = 0;
         let animationFrame;
+        let isMuted = false;
 
         const startButton = document.getElementById('start-button');
         const transcriptDiv = document.getElementById('transcript');
 
+        // SVG Icons
+        const micIconSVG = `
+            <svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
+                <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
+                <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
+                <line x1="12" y1="19" x2="12" y2="23"></line>
+                <line x1="8" y1="23" x2="16" y2="23"></line>
+            </svg>`;
+
+        const micMutedIconSVG = `
+            <svg xmlns="http://www.w3.org/2000/svg" width="100%" height="100%" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
+                <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
+                <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
+                <line x1="12" y1="19" x2="12" y2="23"></line>
+                <line x1="8" y1="23" x2="16" y2="23"></line>
+                <line x1="1" y1="1" x2="23" y2="23"></line>
+            </svg>`;
+
         function showError(message) {
             const toast = document.getElementById('error-toast');
             toast.textContent = message;
@@ -220,35 +276,83 @@
             }, 5000);
         }
 
-        function handleMessage(event) {
+        async function handleMessage(event) {
             // Handle any WebRTC data channel messages if needed
             const eventJson = JSON.parse(event.data);
             if (eventJson.type === "error") {
                 showError(eventJson.message);
+            } else if (eventJson.type === "send_input") {
+                const response = await fetch('/send_input', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({
+                        webrtc_id: webrtc_id,
+                        transcript: ""
+                    })
+                });
             }
             console.log('Received message:', event.data);
+
         }
 
         function updateButtonState() {
+            // Remove existing mute listener if present
+            const existingMuteButton = startButton.querySelector('.mute-toggle');
+            if (existingMuteButton) {
+                existingMuteButton.removeEventListener('click', toggleMute);
+                existingMuteButton.remove();
+            }
+
             if (peerConnection && (peerConnection.connectionState === 'connecting' || peerConnection.connectionState === 'new')) {
                 startButton.innerHTML = `
-                    <div class="icon-with-spinner">
-                        <div class="spinner"></div>
-                        <span>Connecting...</span>
+                    <div class="button-content">
+                        <div class="icon-with-spinner">
+                            <div class="spinner"></div>
+                            <span>Connecting...</span>
+                        </div>
                     </div>
                 `;
+                startButton.disabled = true;
             } else if (peerConnection && peerConnection.connectionState === 'connected') {
                 startButton.innerHTML = `
-                    <div class="pulse-container">
-                        <div class="pulse-circle"></div>
-                        <span>Stop Recording</span>
+                    <div class="button-content">
+                        <div class="pulse-container">
+                            <div class="pulse-circle"></div>
+                            <span>Stop Recording</span>
+                        </div>
+                    </div>
+                    <div class="mute-toggle" title="${isMuted ? 'Unmute' : 'Mute'}">
+                        ${isMuted ? micMutedIconSVG : micIconSVG}
                     </div>
                 `;
+                startButton.disabled = false;
+                const muteButton = startButton.querySelector('.mute-toggle');
+                if (muteButton) {
+                    muteButton.addEventListener('click', toggleMute);
+                }
             } else {
                 startButton.innerHTML = 'Start Recording';
+                startButton.disabled = false;
             }
         }
 
+        function toggleMute(event) {
+            event.stopPropagation();
+            if (!peerConnection || peerConnection.connectionState !== 'connected') return;
+
+            isMuted = !isMuted;
+            console.log("Mute toggled:", isMuted);
+
+            peerConnection.getSenders().forEach(sender => {
+                if (sender.track && sender.track.kind === 'audio') {
+                    sender.track.enabled = !isMuted;
+                    console.log(`Audio track ${sender.track.id} enabled: ${!isMuted}`);
+                }
+            });
+
+            updateButtonState();
+        }
+
         function setupAudioVisualization(stream) {
            audioContext = new (window.AudioContext || window.webkitAudioContext)();
            analyser = audioContext.createAnalyser();
@@ -381,41 +485,45 @@
         function stop() {
             if (animationFrame) {
                 cancelAnimationFrame(animationFrame);
+                animationFrame = null;
             }
             if (audioContext) {
-                audioContext.close();
+                audioContext.close().catch(e => console.error("Error closing AudioContext:", e));
                 audioContext = null;
                 analyser = null;
                 audioSource = null;
             }
             if (peerConnection) {
-                if (peerConnection.getTransceivers) {
-                    peerConnection.getTransceivers().forEach(transceiver => {
-                        if (transceiver.stop) {
-                            transceiver.stop();
-                        }
-                    });
-                }
-
                 if (peerConnection.getSenders) {
                     peerConnection.getSenders().forEach(sender => {
-                        if (sender.track
+                        if (sender.track) {
+                            sender.track.stop();
+                            console.log(`Track ${sender.track.id} stopped.`);
+                        }
                     });
                 }
-
-
-
-                }, 500);
+                peerConnection.close();
+                peerConnection = null;
+                console.log("Peer connection closed.");
             }
             audioLevel = 0;
+            isMuted = false;
             updateButtonState();
         }
 
-        startButton.addEventListener('click', () => {
-            if (
-
-            }
+        startButton.addEventListener('click', (event) => {
+            if (event.target.closest('.mute-toggle')) {
+                return;
+            }
+
+            if (peerConnection && peerConnection.connectionState === 'connected') {
+                console.log("Stop button clicked");
                 stop();
+            } else if (!peerConnection || ['new', 'closed', 'failed', 'disconnected'].includes(peerConnection.connectionState)) {
+                console.log("Start button clicked");
+                transcriptDiv.innerHTML = '';
+                setupWebRTC();
+                updateButtonState();
             }
         });
     </script>
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-fastrtc[vad]
+fastrtc[vad]==0.0.20.rc2
 groq
 python-dotenv
 twilio