# AmharicNLP / app.py
# Commit a17627f (YosefAyele): make it possible to record audio from within the app
# File size: 2.29 kB
import gradio as gr
from speechbrain.inference.ASR import EncoderASR
from pydub import AudioSegment
import os
import numpy as np
from scipy.io.wavfile import write
# Instantiate the pretrained Amharic ASR model once, at import time, so that
# every transcription request reuses the same module-level instance.
asr_model = EncoderASR.from_hparams(
    source="YosefA/wave2vec2_amharic_stt",
    savedir="pretrained_models/asr-wav2vec2-amharic",
)

# Scratch directory for the intermediate .wav files produced by the app.
os.makedirs("temp_audio", exist_ok=True)
def transcribe_audio(audio_file):
    """
    Convert an audio file to .wav format, transcribe it, and return the result.

    Args:
        audio_file: Path of the audio file to transcribe, in any format that
            ``AudioSegment.from_file`` can read. ``None`` or an empty string
            means that no audio was provided.

    Returns:
        The value returned by ``asr_model.transcribe_file`` for the converted
        file, or an empty string when no audio was provided.
    """
    # The "Transcribe" button can be clicked before anything was uploaded or
    # recorded; there is nothing to convert in that case, so return an empty
    # transcription instead of failing inside the audio loader.
    if not audio_file:
        return ""

    # NOTE(review): this fixed path is shared by every call, so overlapping
    # requests would overwrite each other's file — confirm that the app never
    # runs this handler concurrently.
    temp_audio_path = "temp_audio/input_audio.wav"

    try:
        # Convert audio to .wav format
        sound = AudioSegment.from_file(audio_file)
        sound.export(temp_audio_path, format="wav")

        # Transcribe the audio
        return asr_model.transcribe_file(temp_audio_path)
    finally:
        # Always clean up, even when conversion or transcription raised.
        # The file may not exist yet if loading the input failed early.
        try:
            os.remove(temp_audio_path)
        except FileNotFoundError:
            pass
def save_audio_to_file(audio_data, file_path="temp_audio/input_audio.wav"):
    """
    Write in-memory audio (samples plus sample rate) to a .wav file.

    Args:
        audio_data: A two-element pair holding the samples and the sample
            rate. Both ``(samples, sample_rate)`` and Gradio's numpy audio
            order ``(sample_rate, samples)`` are accepted; the scalar element
            is taken to be the sample rate.
        file_path: Destination path of the .wav file.

    Returns:
        ``file_path``, unchanged, so the result can be passed straight on.
    """
    first, second = audio_data
    # Tell the two supported orderings apart: the sample rate is a scalar,
    # the samples are an array with at least one dimension.
    if np.ndim(first) == 0 and np.ndim(second) > 0:
        sample_rate, audio_array = first, second
    else:
        audio_array, sample_rate = first, second

    samples = np.asarray(audio_array)
    if np.issubdtype(samples.dtype, np.floating):
        # Float audio is expected in [-1.0, 1.0]; clip first so out-of-range
        # values saturate instead of wrapping around in the int16 cast.
        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    # Integer samples are assumed to be PCM already and are written as-is;
    # scaling them by 32767 would overflow.

    # A bare file name has no directory component, and os.makedirs("") raises.
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    write(file_path, int(sample_rate), samples)
    return file_path
def process_audio(audio_data):
    """
    Save recorded/uploaded audio to a .wav file and transcribe it.

    Args:
        audio_data: In-memory audio in the form accepted by
            ``save_audio_to_file``, or ``None`` when no audio was provided.

    Returns:
        The result of ``transcribe_audio`` for the saved file, or an empty
        string when ``audio_data`` is ``None``.
    """
    # Nothing was recorded or uploaded: there is nothing to unpack or save.
    if audio_data is None:
        return ""

    temp_audio_path = save_audio_to_file(audio_data)
    try:
        return transcribe_audio(temp_audio_path)
    finally:
        # Do not leave the intermediate file behind when transcription fails.
        # It may already be gone, because transcribe_audio deletes its own
        # temporary file and that can be this same path.
        try:
            os.remove(temp_audio_path)
        except FileNotFoundError:
            pass
# Assemble the Gradio UI: two Markdown headers, a row holding the audio input
# and the transcription text box, and a button that runs the transcription.
# NOTE(review): the original indentation was lost; the button is assumed to
# sit below the row rather than inside it — confirm against the real layout.
with gr.Blocks() as app:
    gr.Markdown("### Amharic Speech-to-Text Transcription App")
    gr.Markdown(
        "Upload or record an audio file in any format, and get its transcription."
    )

    with gr.Row():
        audio_input = gr.Audio(
            label="Upload or Record Audio",
            type="filepath",
        )
        transcription_output = gr.Textbox(label="Transcription")

    transcribe_button = gr.Button("Transcribe")
    # The audio widget hands transcribe_audio a file path (type="filepath");
    # its return value is shown in the text box.
    transcribe_button.click(
        transcribe_audio,
        inputs=audio_input,
        outputs=transcription_output,
    )

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()