import streamlit as st
import soundfile as sf
import librosa
import numpy as np
import time
from transformers import pipeline
from io import BytesIO
import tempfile
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
# Define the models
MODELS = {
    "Whisper (English)": "openai/whisper-small.en",
    "Whisper (Multilingual)": "openai/whisper-small",
    "Facebook Wav2Vec2": "facebook/wav2vec2-large-960h",
    # XLSR-53 is published under the facebook namespace; the base checkpoint is
    # pretrained only, so a CTC fine-tuned variant is needed for actual transcription
    "Wav2Vec2 XLSR-53 (Multilingual)": "facebook/wav2vec2-large-xlsr-53",
    "Whisper (Thai)": "openai/whisper-large"
}
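# Note: the pipeline below is reloaded for every transcription. A minimal sketch of
# caching one pipeline per model (assumes Streamlit >= 1.18 for st.cache_resource;
# load_asr_model is a hypothetical helper name, not part of the original app):
#
#     @st.cache_resource
#     def load_asr_model(model_id: str):
#         return pipeline("automatic-speech-recognition", model=model_id)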
# App UI
st.title("Audio to Text Conversion")
st.subheader("Select language and model")
# Language selection
language = st.selectbox("Choose Language", options=["English", "Thai"])
# Model selection
model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))
# Audio input options
st.subheader("Record or Upload your audio")
audio_option = st.radio("Choose an option:", ('Record Audio', 'Upload Audio'))
audio_data = None
# Queue to store recorded audio frames
audio_queue = queue.Queue()
# WebRTC Audio Recorder
def audio_frame_callback(frame: av.AudioFrame):
    # Convert the incoming frame to an ndarray (int16 samples for the usual s16
    # format) and buffer it so it can be assembled after recording stops
    audio = frame.to_ndarray()
    audio_queue.put(audio)
    return frame
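# streamlit-webrtc invokes audio_frame_callback on a background worker thread, so a
# thread-safe queue.Queue is used to hand frames back to the Streamlit script run.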
# Option 1: Record audio via browser using WebRTC
if audio_option == 'Record Audio':
    st.write("Click the button to start/stop recording.")

    # Use public Google STUN servers to avoid potential connectivity issues
    rtc_configuration = RTCConfiguration(
        {
            "iceServers": [
                {"urls": ["stun:stun1.l.google.com:19302"]},
                {"urls": ["stun:stun2.l.google.com:19302"]}
            ]
        }
    )

    # Start WebRTC recording
    webrtc_ctx = webrtc_streamer(
        key="audio-stream",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=rtc_configuration,
        media_stream_constraints={"audio": True, "video": False},
        audio_frame_callback=audio_frame_callback,
    )
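    # WebRtcMode.SENDONLY means the browser only sends its microphone stream to the
    # server; nothing is played back to the user.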
    # Ensure we are recording
    if webrtc_ctx.state.playing:
        st.write("Recording...")

    # Convert recorded audio frames to a numpy array for processing
    recorded_audio = []
    while not audio_queue.empty():
        recorded_audio.append(audio_queue.get())

    if recorded_audio:
        # Frames are int16 ndarrays, typically of shape (channels, samples); join
        # them along the sample axis and mix down to a mono float32 signal
        audio_data = np.concatenate(recorded_audio, axis=1)
        audio_data = audio_data.mean(axis=0).astype(np.float32) / 32768.0
        sr = 48000  # WebRTC audio is typically delivered at 48 kHz

        # Compute audio properties
        audio_size = len(audio_data) * 2  # size in bytes if stored as 16-bit PCM
        duration = len(audio_data) / sr

        # Display audio properties
        st.write(f"Audio Size: {audio_size} bytes")
        st.write(f"Frame Rate: {sr} Hz")
        st.write(f"Duration: {duration:.2f} seconds")
        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")
        start_time = time.time()

        # Load the model from HuggingFace
        model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])

        # Perform the conversion: sf.write returns None, so write a temporary WAV
        # file and pass its path to the pipeline instead of wrapping it in BytesIO
        sf.write("temp.wav", audio_data, sr)
        result = model("temp.wav")
        end_time = time.time()

        # Display results
        st.write("Transcription:", result['text'])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")
# Option 2: Upload audio
elif audio_option == 'Upload Audio':
    audio_file = st.file_uploader("Upload audio file (WAV format)", type=['wav'])
    if audio_file:
        # Load the audio file
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            tmp_file.write(audio_file.read())
            tmp_file_path = tmp_file.name
        audio_data, sr = librosa.load(tmp_file_path, sr=None)

        # Compute audio properties
        audio_size = len(audio_data) * 2  # in bytes (16-bit PCM)
        frame_rate = sr
        duration = librosa.get_duration(y=audio_data, sr=sr)

        # Display audio properties
        st.write(f"Audio Size: {audio_size} bytes")
        st.write(f"Frame Rate: {frame_rate} Hz")
        st.write(f"Duration: {duration:.2f} seconds")
        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")
        start_time = time.time()

        # Load the model from HuggingFace
        model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])

        # Perform the conversion by passing the temporary file path directly to the
        # pipeline (sf.write returns None, so it cannot be wrapped in BytesIO)
        result = model(tmp_file_path)
        end_time = time.time()

        # Display results
        st.write("Transcription:", result['text'])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")
else:
    st.write("Please select an audio input option.")