import streamlit as st
import soundfile as sf
import librosa
import numpy as np
import time
from transformers import pipeline
import tempfile
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
# Define the models
MODELS = {
    "Whisper (English)": "openai/whisper-small.en",
    "Whisper (Multilingual)": "openai/whisper-small",
    "Facebook Wav2Vec2": "facebook/wav2vec2-large-960h",
    # Note: wav2vec2-large-xlsr-53 is published under the facebook org (not google);
    # the base checkpoint is pretrained only, so a fine-tuned variant is needed for
    # actual transcription.
    "Facebook Wav2Vec2 XLSR-53": "facebook/wav2vec2-large-xlsr-53",
    "Whisper (Thai)": "openai/whisper-large"
}
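# Note: pipeline(...) reloads the selected model on every Streamlit rerun, which
# is slow for large checkpoints. A minimal caching sketch (assuming a Streamlit
# version that provides st.cache_resource; load_asr_pipeline is a hypothetical
# helper name):
#
#     @st.cache_resource
#     def load_asr_pipeline(model_id: str):
#         return pipeline("automatic-speech-recognition", model=model_id)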
# App UI
st.title("Audio to Text Conversion")
st.subheader("Select language and model")
# Language selection
language = st.selectbox("Choose Language", options=["English", "Thai"])
# Model selection
model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))
# Audio input options
st.subheader("Record or Upload your audio")
audio_option = st.radio("Choose an option:", ('Record Audio', 'Upload Audio'))
audio_data = None
# Queue to store recorded audio frames
audio_queue = queue.Queue()
# WebRTC audio frame callback: push each incoming frame onto the queue
def audio_frame_callback(frame: av.AudioFrame):
    audio = frame.to_ndarray()
    audio_queue.put(audio)
    return frame
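# Note: to_ndarray() on a PyAV AudioFrame typically yields int16 samples at the
# browser's capture rate (often 48 kHz rather than 16 kHz), shaped
# (channels, samples). A hedged sketch for resampling a mono recording to the
# 16 kHz most ASR models expect, assuming a 48 kHz int16 source:
#
#     audio_float = audio_data.astype(np.float32) / 32768.0
#     audio_16k = librosa.resample(audio_float, orig_sr=48000, target_sr=16000)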
# Option 1: Record audio via browser using WebRTC
if audio_option == 'Record Audio':
    st.write("Click the button to start/stop recording.")
    # Use Google STUN servers to avoid potential connectivity issues
    rtc_configuration = RTCConfiguration(
        {
            "iceServers": [
                {"urls": ["stun:stun1.l.google.com:19302"]},
                {"urls": ["stun:stun2.l.google.com:19302"]}
            ]
        }
    )
    # Start WebRTC recording
    webrtc_ctx = webrtc_streamer(
        key="audio-stream",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=rtc_configuration,
        media_stream_constraints={"audio": True, "video": False},
        audio_frame_callback=audio_frame_callback,
    )
    # Ensure we are recording
    if webrtc_ctx.state.playing:
        st.write("Recording...")
    # Convert recorded audio frames to a numpy array for processing
    recorded_audio = []
    while not audio_queue.empty():
        recorded_audio.append(audio_queue.get())
    if recorded_audio:
        # Flatten each frame so the result is a 1-D sample array
        audio_data = np.concatenate([chunk.flatten() for chunk in recorded_audio])
        sr = 16000  # Assumed sample rate; browsers often capture WebRTC audio at 48 kHz
        # Compute audio properties
        audio_size = audio_data.nbytes  # size in bytes
        duration = len(audio_data) / sr
        # Display audio properties
        st.write(f"Audio Size: {audio_size} bytes")
        st.write(f"Frame Rate: {sr} Hz")
        st.write(f"Duration: {duration:.2f} seconds")
        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")
        start_time = time.time()
        # Load the model from HuggingFace
        model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
        # Write the recording to a temporary WAV file and transcribe it
        sf.write("temp.wav", audio_data, sr)
        result = model("temp.wav")
        end_time = time.time()
        # Display results
        st.write("Transcription:", result['text'])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")
# Option 2: Upload audio
elif audio_option == 'Upload Audio':
    audio_file = st.file_uploader("Upload audio file (WAV format)", type=['wav'])
    if audio_file:
        # Load the audio file
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            tmp_file.write(audio_file.read())
            tmp_file_path = tmp_file.name
        audio_data, sr = librosa.load(tmp_file_path, sr=None)
        # Compute audio properties
        audio_size = len(audio_data) * 2  # estimated size in bytes, assuming 16-bit PCM
        frame_rate = sr
        duration = librosa.get_duration(y=audio_data, sr=sr)
        # Display audio properties
        st.write(f"Audio Size: {audio_size} bytes")
        st.write(f"Frame Rate: {frame_rate} Hz")
        st.write(f"Duration: {duration:.2f} seconds")
        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")
        start_time = time.time()
        # Load the model from HuggingFace
        model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
        # Transcribe directly from the uploaded temporary file
        result = model(tmp_file_path)
        end_time = time.time()
        # Display results
        st.write("Transcription:", result['text'])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")
else:
    st.write("Please select an audio input option.")