hassanaliemon/BanglaASR · Hugging Face

This is a fine-tuned Whisper small model trained on only 23.8 hours of data. It achieves a Word Error Rate (WER) of 14.73, which is impressive given how little data it was trained on. There is still much room for improvement, which will be addressed in the near future. If anybody wants to collaborate, please be my guest. You can train the model on your own data using this link. Feel free to provide your feedback or connect with me on LinkedIn.

import librosa
import torch
import torchaudio
import numpy as np

from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Checkpoint identifier and the sample MP3 clip to transcribe.
model_path = "hassanaliemon/BanglaASR"
audio_path = "https://huggingface.co/hassanaliemon/BanglaASR/resolve/main/test_audio/common_voice_bn_31255511.mp3"


# Load every component from the same checkpoint. The loads are independent of
# each other. NOTE(review): `tokenizer` is not referenced again in the visible
# part of this script; it is kept because later code may rely on it.
processor = WhisperProcessor.from_pretrained(model_path)
tokenizer = WhisperTokenizer.from_pretrained(model_path)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path)
model = model.to(device)


# Decode the clip and keep only index 0 along the first axis of the returned
# tensor (presumably the first audio channel -- confirm against torchaudio's
# default layout), converted to a NumPy array.
waveform, sampling_rate = torchaudio.load(audio_path, format="mp3")
speech_array = np.asarray(waveform[0].numpy())

# Resample from the file's own rate to 16000 Hz, the same rate that is
# declared to the feature extractor on the next line.
speech_array = librosa.resample(speech_array, orig_sr=sampling_rate, target_sr=16000)
features = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt")
input_features = features.input_features

# Run generation on the selected device and take the first entry of the
# output, then decode it to text with special tokens removed.
generated = model.generate(inputs=input_features.to(device))
predicted_ids = generated[0]
transcription = processor.decode(predicted_ids, skip_special_tokens=True)

print(transcription)