import streamlit as st import requests import asyncio import aiohttp from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription from aiortc.contrib.media import MediaBlackhole, MediaRecorder import av import base64 import json API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo" headers = {"Authorization": f"Bearer {st.secrets['hf_token']}"} class AudioTranscriber: def __init__(self): self.buffer = [] self.text = "" async def transcribe(self, audio_data): async with aiohttp.ClientSession() as session: async with session.post(API_URL, headers=headers, data=audio_data) as response: result = await response.json() if 'text' in result: self.text += result['text'] + " " st.text_area("Transcription", self.text, height=200) class AudioTrack(MediaStreamTrack): kind = "audio" def __init__(self, track, transcriber): super().__init__() self.track = track self.transcriber = transcriber async def recv(self): frame = await self.track.recv() if len(self.transcriber.buffer) < 5: # Collect 5 seconds of audio before transcribing self.transcriber.buffer.append(frame.to_ndarray()) else: audio_data = b''.join([av.AudioFrame.from_ndarray(buf).to_bytes() for buf in self.transcriber.buffer]) asyncio.create_task(self.transcriber.transcribe(audio_data)) self.transcriber.buffer = [] return frame async def process_offer(offer, transcriber): pc = RTCPeerConnection() pc.addTransceiver("audio", direction="recvonly") @pc.on("track") def on_track(track): if track.kind == "audio": pc.addTrack(AudioTrack(track, transcriber)) await pc.setRemoteDescription(RTCSessionDescription(sdp=offer["sdp"], type=offer["type"])) answer = await pc.createAnswer() await pc.setLocalDescription(answer) return {"sdp": pc.localDescription.sdp, "type": pc.localDescription.type} st.title("Real-time Speech Recognition with Whisper") webrtc_ctx = st.config.get_option("server.enableXsrfProtection") if webrtc_ctx: st.warning("To use this app, you need to disable XSRF protection. Set server.enableXsrfProtection=false in your Streamlit config.") else: offer = st.text_input("Paste the offer SDP here") if offer: transcriber = AudioTranscriber() answer = asyncio.run(process_offer(json.loads(offer), transcriber)) st.text_area("Answer SDP", json.dumps(answer)) st.write("Speak into your microphone. The transcription will appear below.") st.markdown("---") st.write("Note: This app uses the Whisper API from Hugging Face for real-time speech recognition.")