Update app.py
app.py
CHANGED
@@ -1,95 +1,74 @@
 import streamlit as st
 import requests
-…
+import asyncio
+import aiohttp
+from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
+from aiortc.contrib.media import MediaBlackhole, MediaRecorder
+import av
 import base64
+import json
 
 API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo"
-headers = {"Authorization": f"Bearer {st.secrets['
+headers = {"Authorization": f"Bearer {st.secrets['hf_token']}"}
 
-…
+class AudioTranscriber:
+    def __init__(self):
+        self.buffer = []
+        self.text = ""
 
-…
+    async def transcribe(self, audio_data):
+        async with aiohttp.ClientSession() as session:
+            async with session.post(API_URL, headers=headers, data=audio_data) as response:
+                result = await response.json()
+                if 'text' in result:
+                    self.text += result['text'] + " "
+                    st.text_area("Transcription", self.text, height=200)
 
-…
+class AudioTrack(MediaStreamTrack):
+    kind = "audio"
 
-…
-        var input = audioContext.createMediaStreamSource(stream);
-        recorder = new Recorder(input);
-        recorder.record();
-        document.getElementById('startButton').style.display = 'none';
-        document.getElementById('stopButton').style.display = 'inline-block';
-    });
-}
-
-function stopRecording() {
-    recorder.stop();
-    document.getElementById('startButton').style.display = 'inline-block';
-    document.getElementById('stopButton').style.display = 'none';
-    recorder.exportWAV(function(blob) {
-        var reader = new FileReader();
-        reader.readAsDataURL(blob);
-        reader.onloadend = function() {
-            var base64data = reader.result;
-            audioData = base64data.split(',')[1]; // Remove the "data:audio/wav;base64," part
-            document.getElementById('audioData').value = audioData;
-            document.getElementById('submitButton').click();
-        }
-    });
-}
-"""
-
-# HTML for buttons
-html_code = """
-<script src="https://cdn.rawgit.com/mattdiamond/Recorderjs/08e7abd9/dist/recorder.js"></script>
-<button id="startButton" onclick="startRecording()">Start Recording</button>
-<button id="stopButton" style="display: none;" onclick="stopRecording()">Stop Recording</button>
-<input type="hidden" id="audioData" name="audioData">
-"""
+    def __init__(self, track, transcriber):
+        super().__init__()
+        self.track = track
+        self.transcriber = transcriber
+
+    async def recv(self):
+        frame = await self.track.recv()
+        if len(self.transcriber.buffer) < 5:  # buffer a handful of ~20 ms frames before transcribing
+            self.transcriber.buffer.append(frame.to_ndarray())
+        else:
+            audio_data = b''.join(buf.tobytes() for buf in self.transcriber.buffer)  # raw PCM bytes
+            asyncio.create_task(self.transcriber.transcribe(audio_data))
+            self.transcriber.buffer = []
+        return frame
+
+async def process_offer(offer, transcriber):
+    pc = RTCPeerConnection()
+    pc.addTransceiver("audio", direction="recvonly")
 
-…
+    @pc.on("track")
+    def on_track(track):
+        if track.kind == "audio":
+            pc.addTrack(AudioTrack(track, transcriber))
 
-…
+    await pc.setRemoteDescription(RTCSessionDescription(sdp=offer["sdp"], type=offer["type"]))
+    answer = await pc.createAnswer()
+    await pc.setLocalDescription(answer)
 
-…
-if audio_data:
-    audio_bytes = base64.b64decode(audio_data)
-    st.audio(audio_bytes, format="audio/wav")
-else:
-    st.warning("No audio recorded. Please record audio before submitting.")
+    return {"sdp": pc.localDescription.sdp, "type": pc.localDescription.type}
 
-…
-st.write(result)
+st.title("Real-time Speech Recognition with Whisper")
+
+xsrf_enabled = st.config.get_option("server.enableXsrfProtection")
+if xsrf_enabled:
+    st.warning("To use this app, you need to disable XSRF protection. Set server.enableXsrfProtection=false in your Streamlit config.")
+else:
+    offer = st.text_input("Paste the offer SDP here")
+    if offer:
+        transcriber = AudioTranscriber()
+        answer = asyncio.run(process_offer(json.loads(offer), transcriber))
+        st.text_area("Answer SDP", json.dumps(answer))
+        st.write("Speak into your microphone. The transcription will appear below.")
 
 st.markdown("---")
-st.write("Note: This app uses the Whisper API from Hugging Face.")
+st.write("Note: This app uses the Whisper API from Hugging Face for real-time speech recognition.")
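The rewritten app waits for a WebRTC offer SDP to be pasted in, but the commit does not show how a caller would produce one. A minimal sketch, assuming aiortc on the client side and a placeholder audio file name ("sample.wav" is not part of the commit), of generating an offer in the {"sdp": ..., "type": ...} shape that process_offer() parses:

import asyncio
import json

from aiortc import RTCPeerConnection
from aiortc.contrib.media import MediaPlayer

async def make_offer():
    pc = RTCPeerConnection()
    player = MediaPlayer("sample.wav")   # placeholder audio source
    pc.addTrack(player.audio)            # offer one outgoing audio track
    offer = await pc.createOffer()
    await pc.setLocalDescription(offer)  # aiortc finishes ICE gathering here
    print(json.dumps({"sdp": pc.localDescription.sdp, "type": pc.localDescription.type}))

asyncio.run(make_offer())

On the transcription side, the Hugging Face Inference API takes a complete encoded audio file as the raw request body and returns JSON with a "text" field. A standalone sketch of the call the app wraps in aiohttp, with a placeholder token and file name:

import requests

API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo"
headers = {"Authorization": "Bearer <hf_token>"}  # placeholder token

with open("sample.wav", "rb") as f:  # placeholder file; wav, flac, or mp3 decode server-side
    response = requests.post(API_URL, headers=headers, data=f.read())
print(response.json().get("text"))

Note that transcribe() above posts raw PCM frames rather than an encoded file, so the buffered chunks may need to be wrapped in a WAV container before the endpoint will decode them.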