Dmtlant commited on
Commit
3a508f7
·
verified ·
1 Parent(s): 5feafa4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -80
app.py CHANGED
@@ -1,95 +1,74 @@
1
  import streamlit as st
2
  import requests
3
- from io import BytesIO
 
 
 
 
4
  import base64
 
5
 
6
  API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo"
7
- headers = {"Authorization": f"Bearer {st.secrets['HF_API_KEY']}"}
8
 
9
- def query(audio_bytes):
10
- response = requests.post(API_URL, headers=headers, data=audio_bytes)
11
- return response.json()
 
12
 
13
- st.title("Speech Recognition with Whisper")
 
 
 
 
 
 
14
 
15
- option = st.radio("Choose input method:", ('Upload File', 'Record from Microphone'))
 
16
 
17
- if option == 'Upload File':
18
- uploaded_file = st.file_uploader("Choose an audio file", type=['wav', 'mp3', 'flac'])
19
- if uploaded_file is not None:
20
- st.audio(uploaded_file, format='audio/wav')
21
- audio_bytes = uploaded_file.read()
22
- else:
23
- st.write("Click the button below and allow microphone access to start recording")
24
-
25
- # JavaScript to handle audio recording
26
- js_code = """
27
- var audioData = null;
28
- var recorder = null;
29
- var audioContext = null;
30
-
31
- function startRecording() {
32
- navigator.mediaDevices.getUserMedia({ audio: true })
33
- .then(stream => {
34
- audioContext = new AudioContext();
35
- var input = audioContext.createMediaStreamSource(stream);
36
- recorder = new Recorder(input);
37
- recorder.record();
38
- document.getElementById('startButton').style.display = 'none';
39
- document.getElementById('stopButton').style.display = 'inline-block';
40
- });
41
- }
42
-
43
- function stopRecording() {
44
- recorder.stop();
45
- document.getElementById('startButton').style.display = 'inline-block';
46
- document.getElementById('stopButton').style.display = 'none';
47
- recorder.exportWAV(function(blob) {
48
- var reader = new FileReader();
49
- reader.readAsDataURL(blob);
50
- reader.onloadend = function() {
51
- var base64data = reader.result;
52
- audioData = base64data.split(',')[1]; // Remove the "data:audio/wav;base64," part
53
- document.getElementById('audioData').value = audioData;
54
- document.getElementById('submitButton').click();
55
- }
56
- });
57
- }
58
- """
59
-
60
- # HTML for buttons
61
- html_code = """
62
- <script src="https://cdn.rawgit.com/mattdiamond/Recorderjs/08e7abd9/dist/recorder.js"></script>
63
- <button id="startButton" onclick="startRecording()">Start Recording</button>
64
- <button id="stopButton" style="display: none;" onclick="stopRecording()">Stop Recording</button>
65
- <input type="hidden" id="audioData" name="audioData">
66
- """
67
 
68
- st.components.v1.html(html_code + f'<script>{js_code}</script>', height=100)
 
 
 
69
 
70
- audio_data = st.text_input("Audio data", key="audioData", type="password")
71
- submit_button = st.empty()
 
72
 
73
- if submit_button.button("Submit", key="submitButton"):
74
- if audio_data:
75
- audio_bytes = base64.b64decode(audio_data)
76
- st.audio(audio_bytes, format="audio/wav")
77
- else:
78
- st.warning("No audio recorded. Please record audio before submitting.")
79
 
80
- if 'audio_bytes' in locals():
81
- if st.button('Transcribe'):
82
- with st.spinner('Transcribing...'):
83
- result = query(audio_bytes)
84
-
85
- if 'text' in result:
86
- st.success("Transcription completed!")
87
- st.write("Transcribed text:")
88
- st.write(result['text'])
89
- else:
90
- st.error("An error occurred during transcription.")
91
- st.write("Error details:")
92
- st.write(result)
93
 
94
  st.markdown("---")
95
- st.write("Note: This app uses the Whisper API from Hugging Face.")
 
1
  import streamlit as st
2
  import requests
3
+ import asyncio
4
+ import aiohttp
5
+ from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
6
+ from aiortc.contrib.media import MediaBlackhole, MediaRecorder
7
+ import av
8
  import base64
9
+ import json
10
 
11
  API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo"
12
+ headers = {"Authorization": f"Bearer {st.secrets['hf_token']}"}
13
 
14
class AudioTranscriber:
    """Accumulates transcribed text from audio chunks sent to the Whisper
    Hugging Face Inference API.

    Attributes:
        buffer: pending audio frames (ndarrays) collected by AudioTrack.recv.
        text: running transcript accumulated across successive API calls.
    """

    def __init__(self):
        self.buffer = []  # filled by AudioTrack.recv, cleared on each flush
        self.text = ""    # full transcript so far

    async def transcribe(self, audio_data):
        """POST raw audio bytes to the Whisper endpoint and append the result.

        Renders the running transcript in a Streamlit text area. API failures
        (model loading, rate limits, non-JSON error pages) are surfaced to the
        user instead of being silently dropped.
        """
        async with aiohttp.ClientSession() as session:
            async with session.post(API_URL, headers=headers, data=audio_data) as response:
                try:
                    result = await response.json()
                except aiohttp.ContentTypeError:
                    # The API can return non-JSON bodies (e.g. HTML error pages).
                    st.error(f"Unexpected non-JSON response from API (HTTP {response.status}).")
                    return
                if 'text' in result:
                    self.text += result['text'] + " "
                    st.text_area("Transcription", self.text, height=200)
                else:
                    # Previously error payloads were swallowed silently; show them.
                    st.error(f"Transcription failed: {result}")
26
 
27
class AudioTrack(MediaStreamTrack):
    """Pass-through WebRTC audio track that batches frames for transcription."""

    kind = "audio"

    def __init__(self, track, transcriber):
        super().__init__()
        self.track = track              # upstream aiortc audio track
        self.transcriber = transcriber  # shared AudioTranscriber instance

    async def recv(self):
        """Receive one frame, buffer it, and flush full batches to the transcriber.

        The frame is always returned unchanged so the media pipeline keeps
        flowing regardless of transcription progress.
        """
        frame = await self.track.recv()
        # NOTE(review): this batches 5 *frames* (typically ~0.1 s of audio),
        # not 5 seconds as the original comment claimed — confirm intended size.
        if len(self.transcriber.buffer) >= 5:
            audio_data = b''.join(
                av.AudioFrame.from_ndarray(buf).to_bytes()
                for buf in self.transcriber.buffer
            )
            # Fire-and-forget so recv() never blocks on the network call.
            asyncio.create_task(self.transcriber.transcribe(audio_data))
            self.transcriber.buffer = []
        # Bug fix: the original dropped the frame that triggered the flush
        # (it was neither batched nor buffered); now every frame is kept.
        self.transcriber.buffer.append(frame.to_ndarray())
        return frame
44
+
45
async def process_offer(offer, transcriber):
    """Complete the WebRTC handshake for a receive-only audio connection.

    Args:
        offer: dict holding the browser offer's 'sdp' and 'type'.
        transcriber: AudioTranscriber that incoming audio is routed into.

    Returns:
        dict with the local answer's 'sdp' and 'type', ready to send back.
    """
    connection = RTCPeerConnection()
    # Receive-only: we consume the browser's microphone, we send nothing back.
    connection.addTransceiver("audio", direction="recvonly")

    @connection.on("track")
    def _route_track(track):
        # Only audio tracks enter the transcription pipeline.
        if track.kind == "audio":
            connection.addTrack(AudioTrack(track, transcriber))

    remote = RTCSessionDescription(sdp=offer["sdp"], type=offer["type"])
    await connection.setRemoteDescription(remote)
    await connection.setLocalDescription(await connection.createAnswer())

    local = connection.localDescription
    return {"sdp": local.sdp, "type": local.type}
 
 
 
 
 
59
 
60
st.title("Real-time Speech Recognition with Whisper")

# XSRF protection blocks the cross-origin requests this flow relies on,
# so refuse to run while it is enabled. (Renamed from the misleading
# `webrtc_ctx` — this holds the XSRF config flag, not a WebRTC context.)
xsrf_enabled = st.config.get_option("server.enableXsrfProtection")
if xsrf_enabled:
    st.warning("To use this app, you need to disable XSRF protection. Set server.enableXsrfProtection=false in your Streamlit config.")
else:
    offer = st.text_input("Paste the offer SDP here")
    if offer:
        try:
            # Bug fix: malformed pasted input used to crash the script with an
            # unhandled JSONDecodeError; report it to the user instead.
            offer_payload = json.loads(offer)
        except json.JSONDecodeError:
            st.error("The offer must be valid JSON containing 'sdp' and 'type' fields.")
        else:
            transcriber = AudioTranscriber()
            # Run the async WebRTC handshake to completion from this sync script.
            answer = asyncio.run(process_offer(offer_payload, transcriber))
            st.text_area("Answer SDP", json.dumps(answer))
            st.write("Speak into your microphone. The transcription will appear below.")

st.markdown("---")
st.write("Note: This app uses the Whisper API from Hugging Face for real-time speech recognition.")