Dmtlant commited on
Commit
3a508f7
·
verified ·
1 Parent(s): 5feafa4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -80
app.py CHANGED
@@ -1,95 +1,74 @@
1
  import streamlit as st
2
  import requests
3
- from io import BytesIO
 
 
 
 
4
  import base64
 
5
 
6
  API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo"
7
- headers = {"Authorization": f"Bearer {st.secrets['HF_API_KEY']}"}
8
 
9
- def query(audio_bytes):
10
- response = requests.post(API_URL, headers=headers, data=audio_bytes)
11
- return response.json()
 
12
 
13
- st.title("Speech Recognition with Whisper")
 
 
 
 
 
 
14
 
15
- option = st.radio("Choose input method:", ('Upload File', 'Record from Microphone'))
 
16
 
17
- if option == 'Upload File':
18
- uploaded_file = st.file_uploader("Choose an audio file", type=['wav', 'mp3', 'flac'])
19
- if uploaded_file is not None:
20
- st.audio(uploaded_file, format='audio/wav')
21
- audio_bytes = uploaded_file.read()
22
- else:
23
- st.write("Click the button below and allow microphone access to start recording")
24
-
25
- # JavaScript to handle audio recording
26
- js_code = """
27
- var audioData = null;
28
- var recorder = null;
29
- var audioContext = null;
30
-
31
- function startRecording() {
32
- navigator.mediaDevices.getUserMedia({ audio: true })
33
- .then(stream => {
34
- audioContext = new AudioContext();
35
- var input = audioContext.createMediaStreamSource(stream);
36
- recorder = new Recorder(input);
37
- recorder.record();
38
- document.getElementById('startButton').style.display = 'none';
39
- document.getElementById('stopButton').style.display = 'inline-block';
40
- });
41
- }
42
-
43
- function stopRecording() {
44
- recorder.stop();
45
- document.getElementById('startButton').style.display = 'inline-block';
46
- document.getElementById('stopButton').style.display = 'none';
47
- recorder.exportWAV(function(blob) {
48
- var reader = new FileReader();
49
- reader.readAsDataURL(blob);
50
- reader.onloadend = function() {
51
- var base64data = reader.result;
52
- audioData = base64data.split(',')[1]; // Remove the "data:audio/wav;base64," part
53
- document.getElementById('audioData').value = audioData;
54
- document.getElementById('submitButton').click();
55
- }
56
- });
57
- }
58
- """
59
-
60
- # HTML for buttons
61
- html_code = """
62
- <script src="https://cdn.rawgit.com/mattdiamond/Recorderjs/08e7abd9/dist/recorder.js"></script>
63
- <button id="startButton" onclick="startRecording()">Start Recording</button>
64
- <button id="stopButton" style="display: none;" onclick="stopRecording()">Stop Recording</button>
65
- <input type="hidden" id="audioData" name="audioData">
66
- """
67
 
68
- st.components.v1.html(html_code + f'<script>{js_code}</script>', height=100)
 
 
 
69
 
70
- audio_data = st.text_input("Audio data", key="audioData", type="password")
71
- submit_button = st.empty()
 
72
 
73
- if submit_button.button("Submit", key="submitButton"):
74
- if audio_data:
75
- audio_bytes = base64.b64decode(audio_data)
76
- st.audio(audio_bytes, format="audio/wav")
77
- else:
78
- st.warning("No audio recorded. Please record audio before submitting.")
79
 
80
- if 'audio_bytes' in locals():
81
- if st.button('Transcribe'):
82
- with st.spinner('Transcribing...'):
83
- result = query(audio_bytes)
84
-
85
- if 'text' in result:
86
- st.success("Transcription completed!")
87
- st.write("Transcribed text:")
88
- st.write(result['text'])
89
- else:
90
- st.error("An error occurred during transcription.")
91
- st.write("Error details:")
92
- st.write(result)
93
 
94
  st.markdown("---")
95
- st.write("Note: This app uses the Whisper API from Hugging Face.")
 
1
  import streamlit as st
2
  import requests
3
+ import asyncio
4
+ import aiohttp
5
+ from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
6
+ from aiortc.contrib.media import MediaBlackhole, MediaRecorder
7
+ import av
8
  import base64
9
+ import json
10
 
11
  API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo"
12
+ headers = {"Authorization": f"Bearer {st.secrets['hf_token']}"}
13
 
14
class AudioTranscriber:
    """Accumulates transcribed text from audio chunks sent to the Whisper
    Hugging Face Inference API.

    Attributes:
        buffer: pending audio frames (ndarrays) collected by AudioTrack.recv.
        text: running transcript accumulated across successive API calls.
    """

    def __init__(self):
        self.buffer = []  # filled by AudioTrack.recv, cleared on each flush
        self.text = ""    # full transcript so far

    async def transcribe(self, audio_data):
        """POST raw audio bytes to the Whisper endpoint and append the result.

        Renders the running transcript in a Streamlit text area. API failures
        (model loading, rate limits, non-JSON error pages) are surfaced to the
        user instead of being silently dropped.
        """
        async with aiohttp.ClientSession() as session:
            async with session.post(API_URL, headers=headers, data=audio_data) as response:
                try:
                    result = await response.json()
                except aiohttp.ContentTypeError:
                    # The API can return non-JSON bodies (e.g. HTML error pages).
                    st.error(f"Unexpected non-JSON response from API (HTTP {response.status}).")
                    return
                if 'text' in result:
                    self.text += result['text'] + " "
                    st.text_area("Transcription", self.text, height=200)
                else:
                    # Previously error payloads were swallowed silently; show them.
                    st.error(f"Transcription failed: {result}")
26
 
27
class AudioTrack(MediaStreamTrack):
    """Pass-through WebRTC audio track that batches frames for transcription."""

    kind = "audio"

    def __init__(self, track, transcriber):
        super().__init__()
        self.track = track              # upstream aiortc audio track
        self.transcriber = transcriber  # shared AudioTranscriber instance

    async def recv(self):
        """Receive one frame, buffer it, and flush full batches to the transcriber.

        The frame is always returned unchanged so the media pipeline keeps
        flowing regardless of transcription progress.
        """
        frame = await self.track.recv()
        # NOTE(review): this batches 5 *frames* (typically ~0.1 s of audio),
        # not 5 seconds as the original comment claimed — confirm intended size.
        if len(self.transcriber.buffer) >= 5:
            audio_data = b''.join(
                av.AudioFrame.from_ndarray(buf).to_bytes()
                for buf in self.transcriber.buffer
            )
            # Fire-and-forget so recv() never blocks on the network call.
            asyncio.create_task(self.transcriber.transcribe(audio_data))
            self.transcriber.buffer = []
        # Bug fix: the original dropped the frame that triggered the flush
        # (it was neither batched nor buffered); now every frame is kept.
        self.transcriber.buffer.append(frame.to_ndarray())
        return frame
44
+
45
async def process_offer(offer, transcriber):
    """Complete the WebRTC handshake for a receive-only audio connection.

    Args:
        offer: dict holding the browser offer's 'sdp' and 'type'.
        transcriber: AudioTranscriber that incoming audio is routed into.

    Returns:
        dict with the local answer's 'sdp' and 'type', ready to send back.
    """
    connection = RTCPeerConnection()
    # Receive-only: we consume the browser's microphone, we send nothing back.
    connection.addTransceiver("audio", direction="recvonly")

    @connection.on("track")
    def _route_track(track):
        # Only audio tracks enter the transcription pipeline.
        if track.kind == "audio":
            connection.addTrack(AudioTrack(track, transcriber))

    remote = RTCSessionDescription(sdp=offer["sdp"], type=offer["type"])
    await connection.setRemoteDescription(remote)
    await connection.setLocalDescription(await connection.createAnswer())

    local = connection.localDescription
    return {"sdp": local.sdp, "type": local.type}
 
 
 
 
 
59
 
60
st.title("Real-time Speech Recognition with Whisper")

# XSRF protection blocks the cross-origin requests this flow relies on,
# so refuse to run while it is enabled. (Renamed from the misleading
# `webrtc_ctx` — this holds the XSRF config flag, not a WebRTC context.)
xsrf_enabled = st.config.get_option("server.enableXsrfProtection")
if xsrf_enabled:
    st.warning("To use this app, you need to disable XSRF protection. Set server.enableXsrfProtection=false in your Streamlit config.")
else:
    offer = st.text_input("Paste the offer SDP here")
    if offer:
        try:
            # Bug fix: malformed pasted input used to crash the script with an
            # unhandled JSONDecodeError; report it to the user instead.
            offer_payload = json.loads(offer)
        except json.JSONDecodeError:
            st.error("The offer must be valid JSON containing 'sdp' and 'type' fields.")
        else:
            transcriber = AudioTranscriber()
            # Run the async WebRTC handshake to completion from this sync script.
            answer = asyncio.run(process_offer(offer_payload, transcriber))
            st.text_area("Answer SDP", json.dumps(answer))
            st.write("Speak into your microphone. The transcription will appear below.")

st.markdown("---")
st.write("Note: This app uses the Whisper API from Hugging Face for real-time speech recognition.")