Spaces:
Sleeping
Sleeping
import gradio as gr | |
import nemo.collections.asr as nemo_asr | |
import numpy as np | |
import torch | |
# Fetch the pre-trained Kabyle Conformer-Transducer checkpoint once at import
# time so that every request served by the app reuses the same model instance.
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    model_name="nvidia/stt_kab_conformer_transducer_large",
)
# Sample rate (Hz) the waveform is converted to before inference.
# NOTE(review): 16 kHz mono is what the checkpoint's model card specifies;
# confirm against the loaded model's preprocessor config.
_TARGET_SAMPLE_RATE = 16000


def _prepare_waveform(sample_rate, samples, target_rate=_TARGET_SAMPLE_RATE):
    """Return *samples* as a mono float32 waveform sampled at *target_rate* Hz."""
    if np.issubdtype(samples.dtype, np.signedinteger):
        # Integer PCM (Gradio delivers int16) is scaled into [-1.0, 1.0).
        full_scale = float(-np.iinfo(samples.dtype).min)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim == 2:
        # Gradio lays multi-channel audio out as (samples, channels).
        samples = samples.mean(axis=1)

    sample_rate = int(sample_rate)
    if sample_rate != target_rate:
        # Imported lazily so the common "already at target rate" path does
        # not pay for loading scipy.signal.
        import math
        from scipy.signal import resample_poly

        divisor = math.gcd(sample_rate, target_rate)
        samples = resample_poly(
            samples, target_rate // divisor, sample_rate // divisor
        )

    return np.ascontiguousarray(samples, dtype=np.float32)


def _extract_text(result):
    """Reduce the value returned by ``asr_model.transcribe`` to a plain string."""
    # NOTE(review): the return shape differs between NeMo releases -- a
    # (best_hypotheses, all_hypotheses) tuple, a list of strings, or a list of
    # hypothesis objects exposing ``.text``. All three are handled here.
    if isinstance(result, tuple):
        result = result[0]
    if isinstance(result, (list, tuple)):
        if not result:
            return ""
        result = result[0]
    text = getattr(result, "text", result)
    return text if isinstance(text, str) else str(text)


def transcribe(audio):
    """Transcribe one audio clip coming from the Gradio audio input.

    Args:
        audio: ``(sample_rate, audio_data)`` tuple, or ``None`` when the user
            submitted the form without recording or uploading anything.
            ``audio_data`` may be a NumPy array or a torch tensor.

    Returns:
        The transcription as a string, or a short human-readable message when
        the input is missing, empty, or of an unsupported type.
    """
    # Gradio passes None when no clip was provided; unpacking it would raise.
    if audio is None:
        return "No audio provided"

    sample_rate, audio_data = audio

    if isinstance(audio_data, torch.Tensor):
        # detach()/cpu() keep the conversion valid for tensors that live on an
        # accelerator or are attached to an autograd graph.
        audio_data = audio_data.detach().cpu().numpy()
    elif not isinstance(audio_data, np.ndarray):
        print("Error: Audio data is neither a numpy array nor a tensor.")
        return "Invalid audio format"

    if audio_data.size == 0:
        return "No audio provided"

    # Compact summary for the logs instead of dumping the raw sample array.
    print(
        f"Audio input: sample_rate={sample_rate}, "
        f"shape={audio_data.shape}, dtype={audio_data.dtype}"
    )

    waveform = _prepare_waveform(sample_rate, audio_data)
    return _extract_text(asr_model.transcribe([waveform]))
# Wire the transcription function into a minimal web UI: a single audio input
# feeding a single text output.
iface = gr.Interface(
    fn=transcribe,
    inputs="audio",
    outputs="text",
)

# Start serving the UI.
iface.launch()