import gradio as gr
import librosa
import nemo.collections.asr as nemo_asr
import numpy as np
import torch

# Load the pre-trained Kabyle ASR model (Conformer-Transducer, expects 16 kHz mono audio)
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "nvidia/stt_kab_conformer_transducer_large"
)


# Function to transcribe the audio input from the Gradio Audio component
def transcribe(audio):
    if audio is None:
        return "No audio received"

    # With type="numpy", Gradio passes audio as a tuple (sample_rate, audio_data)
    sample_rate, audio_data = audio

    # Ensure the audio data is a numpy array
    if isinstance(audio_data, torch.Tensor):
        audio_data = audio_data.cpu().numpy()
    elif not isinstance(audio_data, np.ndarray):
        return "Invalid audio format"

    # Down-mix stereo recordings to mono
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Convert integer PCM samples to float32 in [-1, 1]
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    else:
        audio_data = audio_data.astype(np.float32)

    # The model expects 16 kHz audio; resample if the input rate differs
    if sample_rate != 16000:
        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)

    # transcribe() may return a list of strings, Hypothesis objects, or a
    # (best, all) tuple depending on the NeMo version; unwrap to plain text
    result = asr_model.transcribe([audio_data])
    hypothesis = result[0]
    if isinstance(hypothesis, (list, tuple)):
        hypothesis = hypothesis[0]
    return hypothesis.text if hasattr(hypothesis, "text") else str(hypothesis)


# Create the Gradio interface with audio input and text output
iface = gr.Interface(fn=transcribe, inputs=gr.Audio(type="numpy"), outputs="text")

# Launch the Gradio interface
iface.launch()
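

# A minimal alternative sketch, assuming the soundfile package is installed:
# some older NeMo releases only accept audio file paths in transcribe(), so this
# hypothetical helper (not wired into the interface above) writes the prepared
# 16 kHz mono float32 samples to a temporary WAV file and transcribes that path.
import os
import tempfile

import soundfile as sf


def transcribe_via_tempfile(audio_data, sample_rate=16000):
    # Create a closed temp file, write the samples, and hand the path to the model
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    sf.write(tmp.name, audio_data, sample_rate)
    try:
        return asr_model.transcribe([tmp.name])[0]
    finally:
        os.remove(tmp.name)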